diff --git a/.github/README.md b/.github/README.md index a8846260c8..528df18faf 100644 --- a/.github/README.md +++ b/.github/README.md @@ -72,7 +72,7 @@ ff.init( Second, we specify the LLM to serve and the SSM(s) used to accelerate LLM serving. The list of supported LLMs and SSMs is available at [supported models](#supported-llms-and-ssms). ```python # Specify the LLM -llm = ff.LLM("decapoda-research/llama-7b-hf") +llm = ff.LLM("meta-llama/Llama-2-7b-hf") # Specify a list of SSMs (just one in this case) ssms=[] @@ -116,7 +116,7 @@ ff.init( ) # Create the FlexFlow LLM -llm = ff.LLM("decapoda-research/llama-7b-hf") +llm = ff.LLM("meta-llama/Llama-2-7b-hf") # Create the sampling configs generation_config = ff.GenerationConfig( @@ -152,7 +152,7 @@ A C++ example is available at [this folder](../inference/spec_infer/). After bui * `-ll:gpu`: number of GPU processors to use on each node for serving an LLM (default: 0) * `-ll:fsize`: size of device memory on each GPU in MB * `-ll:zsize`: size of zero-copy memory (pinned DRAM with direct GPU access) in MB. FlexFlow Serve keeps a replica of the LLM parameters on zero-copy memory, and therefore requires that the zero-copy memory is sufficient for storing the LLM parameters. -* `-llm-model`: the LLM model ID from HuggingFace (e.g. "decapoda-research/llama-7b-hf") +* `-llm-model`: the LLM model ID from HuggingFace (e.g. "meta-llama/Llama-2-7b-hf") * `-ssm-model`: the SSM model ID from HuggingFace (e.g. "JackFram/llama-160m"). You can use multiple `-ssm-model`s in the command line to launch multiple SSMs. * `-cache-folder`: the folder * `-data-parallelism-degree`, `-tensor-parallelism-degree` and `-pipeline-parallelism-degree`: parallelization degrees in the data, tensor, and pipeline dimensions. Their product must equal the number of GPUs available on the machine. When any of the three parallelism degree arguments is omitted, a default value of 1 will be used. @@ -162,7 +162,7 @@ A C++ example is available at [this folder](../inference/spec_infer/). After bui For example, you can use the following command line to serve a LLaMA-7B or LLaMA-13B model on 4 GPUs and use two collectively boost-tuned LLaMA-68M models for speculative inference. 
```bash -./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion +./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion ``` @@ -193,7 +193,7 @@ Below is a list of models that we have explicitly tested and for which a SSM may | Model | Model id on HuggingFace | Boost-tuned SSMs | | :---- | :---- | :---- | -| LLaMA-7B | decapoda-research/llama-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| LLaMA-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | | LLaMA-13B | decapoda-research/llama-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | | LLaMA-30B | decapoda-research/llama-30b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | | LLaMA-65B | decapoda-research/llama-65b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | diff --git a/.github/workflows/gpu-ci-skip.yml b/.github/workflows/gpu-ci-skip.yml index 6a18e56bd1..f4cb950931 100644 --- a/.github/workflows/gpu-ci-skip.yml +++ b/.github/workflows/gpu-ci-skip.yml @@ -15,7 +15,7 @@ on: - ".github/workflows/gpu-ci.yml" - "tests/cpp_gpu_tests.sh" - "tests/inference_tests.sh" - - "tests/multi_gpu_tests.sh" + - "tests/training_tests.sh" - "tests/python_interface_test.sh" workflow_dispatch: @@ -44,8 +44,8 @@ jobs: steps: - run: 'echo "No gpu-ci required"' - gpu-ci-flexflow: - name: Single Machine, Multiple GPUs Tests + training-tests: + name: Training Tests runs-on: ubuntu-20.04 # if: ${{ github.event_name != 'pull_request' || github.base_ref != 'inference' }} needs: inference-tests diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 4a43a3dee7..3901d6b5f7 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -15,7 +15,7 @@ on: - ".github/workflows/gpu-ci.yml" - "tests/cpp_gpu_tests.sh" - "tests/inference_tests.sh" - - "tests/multi_gpu_tests.sh" + - "tests/training_tests.sh" - "tests/python_interface_test.sh" push: branches: @@ -34,7 +34,7 @@ on: - ".github/workflows/gpu-ci.yml" - "tests/cpp_gpu_tests.sh" - "tests/inference_tests.sh" - - "tests/multi_gpu_tests.sh" + - "tests/training_tests.sh" - "tests/python_interface_test.sh" workflow_dispatch: @@ -141,7 +141,8 @@ jobs: run: shell: bash -l {0} # required to use an activated conda environment env: - CONDA: "3" + CONDA: "3" + HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }} needs: gpu-ci-concierge container: image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest @@ -185,7 +186,7 @@ jobs: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib # GPT tokenizer test - ./tests/gpt_tokenizer_test.sh + # ./tests/gpt_tokenizer_test.sh # Inference tests source ./build/set_python_envs.sh @@ -209,8 +210,8 @@ jobs: if: always() run: sudo rm -rf ~/.cache - gpu-ci-flexflow: - name: Single Machine, Multiple GPUs Tests + training-tests: + name: Training Tests runs-on: [self-hosted, gpu] # skip this time-consuming test for PRs to the inference branch # if: ${{ 
github.event_name != 'pull_request' || github.base_ref != 'inference' }} @@ -266,5 +267,5 @@ jobs: # C++ tests ./tests/cpp_gpu_tests.sh 4 # Python tests - ./tests/multi_gpu_tests.sh 4 + ./tests/training_tests.sh 4 diff --git a/.github/workflows/multinode-test.yml b/.github/workflows/multinode-test.yml index ca2b47df27..226f953b38 100644 --- a/.github/workflows/multinode-test.yml +++ b/.github/workflows/multinode-test.yml @@ -78,7 +78,7 @@ jobs: export OMPI_ALLOW_RUN_AS_ROOT=1 export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 export OMPI_MCA_btl_vader_single_copy_mechanism=none - ./tests/multi_gpu_tests.sh 2 2 + ./tests/training_tests.sh 2 2 multinode-gpu-test-ucx: name: Multinode GPU Test with UCX @@ -129,7 +129,7 @@ jobs: export OMPI_ALLOW_RUN_AS_ROOT=1 export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 export OMPI_MCA_btl_vader_single_copy_mechanism=none - ./tests/multi_gpu_tests.sh 2 2 + ./tests/training_tests.sh 2 2 multinode-gpu-test-native-ucx: name: Multinode GPU Test with native UCX @@ -177,7 +177,7 @@ jobs: export OMPI_ALLOW_RUN_AS_ROOT=1 export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 export OMPI_MCA_btl_vader_single_copy_mechanism=none - ./tests/multi_gpu_tests.sh 2 2 + ./tests/training_tests.sh 2 2 notify-slack: name: Notify Slack in case of failure diff --git a/CMakeLists.txt b/CMakeLists.txt index f9ce66a0f1..3732d5ff6f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,6 +32,19 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) STRING "Choose the type of build." FORCE) endif() +if(INSTALL_DIR) + message(STATUS "INSTALL_DIR: ${INSTALL_DIR}") + set(CMAKE_INSTALL_PREFIX ${INSTALL_DIR} CACHE PATH "Installation directory" FORCE) +else() + # Install DIR not set. Use default, unless a conda environment is active + if (DEFINED ENV{CONDA_PREFIX} AND NOT FF_BUILD_FROM_PYPI) + set(CONDA_PREFIX $ENV{CONDA_PREFIX}) + # Set CMAKE_INSTALL_PREFIX to the Conda environment's installation path + set(CMAKE_INSTALL_PREFIX ${CONDA_PREFIX} CACHE PATH "Installation directory" FORCE) + message(STATUS "Active conda environment detected. Setting CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}") + endif() +endif() + # do not disable assertions even if in release mode set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -UNDEBUG") diff --git a/INSTALL.md b/INSTALL.md index a197df24ed..1734319540 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -97,7 +97,7 @@ source ./build/set_python_envs.sh cd "$FF_HOME" ./python/flexflow_python examples/python/native/mnist_mlp.py -ll:py 1 -ll:gpu 1 -ll:fsize -ll:zsize ``` -A script to run all the Python examples is available at `tests/multi_gpu_tests.sh` +A script to run all the Python examples is available at `tests/training_tests.sh` ### Run FlexFlow C++ examples diff --git a/SERVE.md b/SERVE.md index 60d0b566f0..f6e34750cd 100644 --- a/SERVE.md +++ b/SERVE.md @@ -32,7 +32,7 @@ ff.init( Second, we specify the LLM to serve and the SSM(s) used to accelerate LLM serving. The list of supported LLMs and SSMs is available at [supported models](#supported-llms-and-ssms). ```python # Specify the LLM -llm = ff.LLM("decapoda-research/llama-7b-hf") +llm = ff.LLM("meta-llama/Llama-2-7b-hf") # Specify a list of SSMs (just one in this case) ssms=[] @@ -78,7 +78,7 @@ ff.init( ) # Create the FlexFlow LLM -llm = ff.LLM("decapoda-research/llama-7b-hf") +llm = ff.LLM("meta-llama/Llama-2-7b-hf") # Create the sampling configs generation_config = ff.GenerationConfig( @@ -116,7 +116,7 @@ A C++ example is available at [this folder](../inference/spec_infer/). 
After bui * `-ll:gpu`: number of GPU processors to use on each node for serving an LLM (default: 0) * `-ll:fsize`: size of device memory on each GPU in MB * `-ll:zsize`: size of zero-copy memory (pinned DRAM with direct GPU access) in MB. FlexFlow Serve keeps a replica of the LLM parameters on zero-copy memory, and therefore requires that the zero-copy memory is sufficient for storing the LLM parameters. -* `-llm-model`: the LLM model ID from HuggingFace (e.g. "decapoda-research/llama-7b-hf") +* `-llm-model`: the LLM model ID from HuggingFace (e.g. "meta-llama/Llama-2-7b-hf") * `-ssm-model`: the SSM model ID from HuggingFace (e.g. "JackFram/llama-160m"). You can use multiple `-ssm-model`s in the command line to launch multiple SSMs. * `-cache-folder`: the folder * `-data-parallelism-degree`, `-tensor-parallelism-degree` and `-pipeline-parallelism-degree`: parallelization degrees in the data, tensor, and pipeline dimensions. Their product must equal the number of GPUs available on the machine. When any of the three parallelism degree arguments is omitted, a default value of 1 will be used. @@ -126,7 +126,7 @@ A C++ example is available at [this folder](../inference/spec_infer/). After bui For example, you can use the following command line to serve a LLaMA-7B or LLaMA-13B model on 4 GPUs and use two collectively boost-tuned LLaMA-68M models for speculative inference. ```bash -./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion +./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion ``` @@ -157,7 +157,7 @@ Below is a list of models that we have explicitly tested and for which a SSM may | Model | Model id on HuggingFace | Boost-tuned SSMs | | :---- | :---- | :---- | -| LLaMA-7B | decapoda-research/llama-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| LLaMA-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | | LLaMA-13B | decapoda-research/llama-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | | LLaMA-30B | decapoda-research/llama-30b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | | LLaMA-65B | decapoda-research/llama-65b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | diff --git a/conda/environment.yml b/conda/environment.yml index 9ae0dc9c7a..48cd8ddb33 100644 --- a/conda/environment.yml +++ b/conda/environment.yml @@ -3,7 +3,7 @@ channels: - defaults - conda-forge dependencies: - - python>=3.6 + - python>=3.6,<3.12 - cffi>=1.11.0 - Pillow - pybind11 diff --git a/conda/flexflow.yml b/conda/flexflow.yml index 3e39407bfa..89421db758 100644 --- a/conda/flexflow.yml +++ b/conda/flexflow.yml @@ -3,7 +3,7 @@ channels: - defaults - conda-forge dependencies: - - python>=3.6 + - python>=3.6,<3.12 - cffi>=1.11.0 - Pillow - pybind11 diff --git a/config/config.inc b/config/config.inc index 7f1f0ffcf4..5a7bde5ce9 100644 --- a/config/config.inc +++ b/config/config.inc @@ -24,7 +24,7 @@ fi #set 
installation dir if [ -n "$INSTALL_DIR" ]; then - SET_INSTALL_DIR="-DCMAKE_INSTALL_PREFIX=${INSTALL_DIR}" + SET_INSTALL_DIR="-DINSTALL_DIR=${INSTALL_DIR}" fi if [ "$INFERENCE_TESTS" = "ON" ]; then diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 6fe52e6892..fb12adf2d3 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -195,11 +195,10 @@ enum OperatorType { enum ModelType { UNKNOWN = 3001, LLAMA = 3002, - LLAMA2 = 3003, - OPT = 3004, - FALCON = 3005, - STARCODER = 3006, - MPT = 3007 + OPT = 3003, + FALCON = 3004, + STARCODER = 3005, + MPT = 3006 }; enum PMParameter { diff --git a/inference/MODEL_WEIGHTS.md b/inference/MODEL_WEIGHTS.md index e46e6b45d1..d78fb37be9 100644 --- a/inference/MODEL_WEIGHTS.md +++ b/inference/MODEL_WEIGHTS.md @@ -2,7 +2,7 @@ To convert the weights of a HuggingFace LLM to SpecInfer's weight format, we fir ```python from transformers import AutoModelForCausalLM -model = AutoModelForCausalLM.from_pretrained("decapoda-research/llama-7b-hf") +model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") for name, params in model.named_parameters(): for name, params in model.named_parameters(): diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 90d1902716..7c4cef0973 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -186,14 +186,7 @@ void FlexFlow::top_level_task(Task const *task, auto architectures = model_config["architectures"]; for (auto const &str : architectures) { if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { - std::string nameOrPath = model_config["_name_or_path"]; - // TODO: support LLAMA-2 models not from Meta - bool llama2 = nameOrPath.find("meta-llama/Llama-2") == 0; - if (llama2) { - model_type = ModelType::LLAMA2; - } else { - model_type = ModelType::LLAMA; - } + model_type = ModelType::LLAMA; break; } else if (str == "OPTForCausalLM") { model_type = ModelType::OPT; @@ -229,7 +222,7 @@ void FlexFlow::top_level_task(Task const *task, rm->register_output_filepath(file_paths.output_file_path); FFModel model(ffconfig, ffconfig.cpu_offload); - if (model_type == ModelType::LLAMA || model_type == ModelType::LLAMA2) { + if (model_type == ModelType::LLAMA) { LLAMA::create_llama_model(model, config_filepath, weights_filepath, diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py index 3621ee83a3..4a146ab503 100644 --- a/inference/python/incr_decoding.py +++ b/inference/python/incr_decoding.py @@ -43,7 +43,7 @@ def get_configs(): # required parameters "num_gpus": 4, "memory_per_gpu": 14000, - "zero_copy_memory_per_node": 30000, + "zero_copy_memory_per_node": 40000, # optional parameters "num_cpus": 4, "legion_utility_processors": 4, @@ -108,7 +108,7 @@ def main(): prompts = [s for s in json.load(open(configs.prompt))] results = llm.generate(prompts) else: - result = llm.generate("Here are some travel tips for Tokyo:\n") + result = llm.generate("Three tips for staying healthy are: ") if __name__ == "__main__": diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index 3d0f1a1c0e..c9fb5cc7bb 100644 --- a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -43,7 +43,7 @@ def get_configs(): # required parameters "num_gpus": 4, "memory_per_gpu": 14000, - "zero_copy_memory_per_node": 30000, + "zero_copy_memory_per_node": 40000, # optional parameters "num_cpus": 4, "legion_utility_processors": 4, @@ -60,7 +60,7 @@ def 
get_configs(): } llm_configs = { # required llm arguments - "llm_model": "decapoda-research/llama-7b-hf", + "llm_model": "meta-llama/Llama-2-7b-hf", # optional llm parameters "cache_path": "", "refresh_cache": False, @@ -68,7 +68,7 @@ def get_configs(): "ssms": [ { # required ssm parameter - "ssm_model": "JackFram/llama-160m-base", + "ssm_model": "JackFram/llama-160m", # optional ssm parameters "cache_path": "", "refresh_cache": False, @@ -154,7 +154,7 @@ def main(): prompts = [s for s in json.load(open(configs.prompt))] results = llm.generate(prompts) else: - result = llm.generate("Here are some travel tips for Tokyo:\n") + result = llm.generate("Three tips for staying healthy are: ") if __name__ == "__main__": diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 98b5ec4633..8b0eb926d9 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -166,14 +166,7 @@ void get_model_meta(FilePaths &file_paths, auto architectures = llm_model_config["architectures"]; for (auto const &str : architectures) { if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { - std::string nameOrPath = llm_model_config["_name_or_path"]; - // TODO: support LLAMA-2 models not from Meta - bool llama2 = nameOrPath.find("meta-llama/Llama-2") == 0; - if (llama2) { - model_metadata.llm_model_type = ModelType::LLAMA2; - } else { - model_metadata.llm_model_type = ModelType::LLAMA; - } + model_metadata.llm_model_type = ModelType::LLAMA; break; } else if (str == "OPTForCausalLM") { model_metadata.llm_model_type = ModelType::OPT; @@ -223,14 +216,7 @@ void get_model_meta(FilePaths &file_paths, auto architectures = ssm_model_config["architectures"]; for (auto const &str : architectures) { if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { - std::string nameOrPath = ssm_model_config["_name_or_path"]; - // TODO: support LLAMA-2 models not from Meta - bool llama2 = nameOrPath.find("meta-llama/Llama-2") == 0; - if (llama2) { - ssm_model_type = ModelType::LLAMA2; - } else { - ssm_model_type = ModelType::LLAMA; - } + ssm_model_type = ModelType::LLAMA; break; } else if (str == "OPTForCausalLM") { ssm_model_type = ModelType::OPT; @@ -318,8 +304,7 @@ void FlexFlow::top_level_task(Task const *task, // Create LLM model FFModel tree_model(ffconfig, ffconfig.cpu_offload); - if (model_metadata.llm_model_type == ModelType::LLAMA || - model_metadata.llm_model_type == ModelType::LLAMA2) { + if (model_metadata.llm_model_type == ModelType::LLAMA) { LLAMA::create_llama_model(tree_model, model_metadata.llm_model_config_path, model_metadata.llm_weights_path, @@ -363,8 +348,7 @@ void FlexFlow::top_level_task(Task const *task, for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) { FFModel &beam_model = ssm_models[ssm_id]; - if (model_metadata.ssm_model_types[ssm_id] == ModelType::LLAMA || - model_metadata.ssm_model_types[ssm_id] == ModelType::LLAMA2) { + if (model_metadata.ssm_model_types[ssm_id] == ModelType::LLAMA) { LLAMA::create_llama_model(beam_model, model_metadata.ssm_model_config_paths[ssm_id], model_metadata.ssm_model_weights_paths[ssm_id], diff --git a/inference/utils/compress_llama_weights.py b/inference/utils/compress_llama_weights.py index c92ae6aca9..daaee9c9d5 100644 --- a/inference/utils/compress_llama_weights.py +++ b/inference/utils/compress_llama_weights.py @@ -91,7 +91,7 @@ def decompress(packed_data, config): if __name__ == "__main__": # torch.set_default_tensor_type(torch.HalfTensor) # torch.set_default_tensor_type(torch.cuda.HalfTensor) - 
model = AutoModelForCausalLM.from_pretrained("decapoda-research/llama-7b-hf") + model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") config = CompressionConfig( num_bits=8, group_size=32, group_dim=0, symmetric=False) for name, params in model.named_parameters(): diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index e0e1b2e155..24cf9efb30 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -112,7 +112,7 @@ def __init__( ): """Create the LLM object - :param model_name: The name of the HuggingFace model to use. E.g. 'decapoda-research/llama-7b-hf' + :param model_name: The name of the HuggingFace model to use. E.g. 'meta-llama/Llama-2-7b-hf' :type model_name: str :param data_type: The data type to use for the tensors (e.g. DataType.DT_FLOAT for full precision, or DataType.DT_HALF for half precision), defaults to DataType.DT_HALF :type data_type: DataType, optional @@ -447,7 +447,7 @@ def __init__( ): """Create the SSM object - :param model_name: The name of the HuggingFace model to use. E.g. 'decapoda-research/llama-7b-hf' + :param model_name: The name of the HuggingFace model to use. E.g. 'meta-llama/Llama-2-7b-hf' :type model_name: str :param data_type: The data type to use for the tensors (e.g. DataType.DT_FLOAT for full precision, or DataType.DT_HALF for half precision), defaults to DataType.DT_HALF :type data_type: DataType, optional diff --git a/python/flexflow/type.py b/python/flexflow/type.py index 9caecdde54..994a85f57e 100644 --- a/python/flexflow/type.py +++ b/python/flexflow/type.py @@ -75,11 +75,10 @@ class InferenceMode(Enum): class ModelType(Enum): UNKNOWN = 3001 LLAMA = 3002 - LLAMA2 = 3003 - OPT = 3004 - FALCON = 3005 - STARCODER = 3006 - MPT = 3007 + OPT = 3003 + FALCON = 3004 + STARCODER = 3005 + MPT = 3006 class OpType(Enum): diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 0b89010ab1..df8d43bc38 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -117,7 +117,7 @@ void RequestManager::register_tokenizer(ModelType type, this->eos_token_id = eos_token_id; std::string tokenizer_folder = (!path.empty() && path.back() != '/') ? 
path + '/' : path; - if (model_type == ModelType::LLAMA || model_type == ModelType::LLAMA2) { + if (model_type == ModelType::LLAMA) { bool path_to_file = !path.empty() && (path.size() >= strlen("tokenizer.model")) && path.find("tokenizer.model") == @@ -492,6 +492,12 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, old_bc.requestsInfo[i].request_guid, request.tokens.size()); std::string output = this->tokenizer_->Decode(request.tokens); + // Unlike Huggingface, the sentencepiece C++ library automatically + // removes the BOS token + if (model_type == ModelType::LLAMA && + request.tokens.at(0) == bos_token_id) { + output = " " + output; + } { // update generation result and trigger future @@ -743,6 +749,12 @@ BeamSearchBatchConfig request.guid, request.tokens.size()); std::string output = this->tokenizer_->Decode(request.tokens); + // Unlike Huggingface, the sentencepiece C++ library automatically + // removes the BOS token + if (model_type == ModelType::LLAMA && + request.tokens.at(0) == bos_token_id) { + output = " " + output; + } { // update generation result and trigger future GenerationResult &gr = request_generation_results[request.guid]; @@ -854,6 +866,12 @@ BeamSearchBatchConfig } } std::string output = this->tokenizer_->Decode(request.tokens); + // Unlike Huggingface, the sentencepiece C++ library automatically + // removes the BOS token + if (model_type == ModelType::LLAMA && + request.tokens.at(0) == bos_token_id) { + output = " " + output; + } log_req_mgr.print("Output: %s", output.c_str()); } } else if (request.status == Request::PENDING) { @@ -887,6 +905,12 @@ BeamSearchBatchConfig // Token Info std::string output = this->tokenizer_->Decode(request.tokens); + // Unlike Huggingface, the sentencepiece C++ library automatically removes + // the BOS token + if (model_type == ModelType::LLAMA && + request.tokens.at(0) == bos_token_id) { + output = " " + output; + } log_req_mgr.print("Output: %s", output.c_str()); } else { assert(false); diff --git a/tests/inference/cpp_inference_tests.sh b/tests/inference/cpp_inference_tests.sh index 42a6db09d8..8beea55999 100755 --- a/tests/inference/cpp_inference_tests.sh +++ b/tests/inference/cpp_inference_tests.sh @@ -10,9 +10,9 @@ cd "${BASH_SOURCE[0]%/*}" ############################################################################################### # LLAMA -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4 # LLAMA (half precision) -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt 
../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4 # OPT ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt.txt -pipeline-parallelism-degree 4 @@ -22,9 +22,9 @@ cd "${BASH_SOURCE[0]%/*}" # Tensor parallelism tests if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then # LLAMA - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # LLAMA (half precision) - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 @@ -37,14 +37,17 @@ fi ############################################################################################### # LLAMA (small model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 4 + +../../build/inference/incr_decoding/incr_decoding -ll:gpu 1 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 1 + # LLAMA (small model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m-base -prompt 
../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4 # LLAMA (big model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model decapoda-research/llama-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_7B.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B.txt -pipeline-parallelism-degree 4 # LLAMA (big model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model decapoda-research/llama-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_7B_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half.txt -pipeline-parallelism-degree 4 # OPT (small model) ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M.txt -pipeline-parallelism-degree 4 @@ -57,9 +60,9 @@ fi ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half.txt -pipeline-parallelism-degree 4 # Falcon (full precision) -# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 40000 --fusion --use-full-precision -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 # Falcon (half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 +# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 # # StarCoder (full precision) # ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model 
bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B.txt -pipeline-parallelism-degree 4 @@ -69,16 +72,16 @@ fi # Tensor parallelism tests if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then # LLAMA (small model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # LLAMA (small model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # LLAMA (big model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model decapoda-research/llama-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 
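These tensor-parallelism runs are later compared against the pipeline-parallelism-only outputs: full-precision files must match exactly once the first few header lines are dropped, while half-precision files are only required to agree on a prefix of tokens. The sketch below is a rough Python rendering of that comparison logic; the paths, the number of skipped header lines, and the token count are illustrative, and the script's actual helpers (such as `check_partial_token_match`) are defined outside this excerpt.

```python
# Rough Python rendering of the shell alignment checks used in this script:
# `diff <(tail -n +3 A) <(tail -n +3 B)` for full precision, and a
# first-N-token comparison for half precision. Paths, the number of skipped
# header lines, and the token count are illustrative only.
from pathlib import Path


def read_body(path, skip_lines=2):
    """Return the file contents with the first `skip_lines` header lines removed."""
    lines = Path(path).read_text().splitlines()
    return "\n".join(lines[skip_lines:])


def exact_match(file_a, file_b, skip_lines=2):
    """Full-precision check: outputs must be identical after the headers."""
    return read_body(file_a, skip_lines) == read_body(file_b, skip_lines)


def prefix_token_match(file_a, file_b, num_tokens=20):
    """Half-precision check: only the first few tokens need to agree."""
    return (read_body(file_a).split()[:num_tokens]
            == read_body(file_b).split()[:num_tokens])


if __name__ == "__main__":
    base = "../../inference/output/"
    print(exact_match(base + "incr_decoding_llama_2_7B_tp.txt",
                      base + "incr_decoding_llama_2_7B.txt"))
```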
# LLAMA (big model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model decapoda-research/llama-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_7B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT (small model) ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 @@ -179,22 +182,22 @@ function compare_decoding_steps_spec_infer_incr_decoding { ############ Alignment between speculative inference and incremental decoding ################# # Full precision -diff <(tail -n +3 "../../inference/output/incr_decoding_llama_7B.txt") <(tail -n +3 "../../inference/output/spec_inference_llama.txt") +diff <(tail -n +3 "../../inference/output/incr_decoding_llama_2_7B.txt") <(tail -n +3 "../../inference/output/spec_inference_llama.txt") diff <(tail -n +3 "../../inference/output/incr_decoding_opt_6B.txt") <(tail -n +3 "../../inference/output/spec_inference_opt.txt") # Half precision -check_partial_token_match "../../inference/output/incr_decoding_llama_7B_half.txt" "../../inference/output/spec_inference_llama_half.txt" +check_partial_token_match "../../inference/output/incr_decoding_llama_2_7B_half.txt" "../../inference/output/spec_inference_llama_half.txt" check_partial_token_match "../../inference/output/incr_decoding_opt_6B_half.txt" "../../inference/output/spec_inference_opt_half.txt" # Speed test: speculative inference should be at very least 1.5x faster than incremental decoding # Full precision -#compare_speed_spec_infer_incr_decoding "../../inference/output/incr_decoding_llama_7B.txt" "../../inference/output/spec_inference_llama.txt" +#compare_speed_spec_infer_incr_decoding "../../inference/output/incr_decoding_llama_2_7B.txt" "../../inference/output/spec_inference_llama.txt" #compare_speed_spec_infer_incr_decoding "../../inference/output/incr_decoding_opt_6B.txt" "../../inference/output/spec_inference_opt.txt" -compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_decoding_llama_7B.txt" "../../inference/output/spec_inference_llama.txt" +compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_decoding_llama_2_7B.txt" "../../inference/output/spec_inference_llama.txt" compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_decoding_opt_6B.txt" "../../inference/output/spec_inference_opt.txt" # Half precision -#compare_speed_spec_infer_incr_decoding "../../inference/output/incr_decoding_llama_7B_half.txt" "../../inference/output/spec_inference_llama_half.txt" +#compare_speed_spec_infer_incr_decoding "../../inference/output/incr_decoding_llama_2_7B_half.txt" "../../inference/output/spec_inference_llama_half.txt" #compare_speed_spec_infer_incr_decoding "../../inference/output/incr_decoding_opt_6B_half.txt" "../../inference/output/spec_inference_opt_half.txt" -compare_decoding_steps_spec_infer_incr_decoding 
"../../inference/output/incr_decoding_llama_7B_half.txt" "../../inference/output/spec_inference_llama_half.txt" +compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_decoding_llama_2_7B_half.txt" "../../inference/output/spec_inference_llama_half.txt" compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_decoding_opt_6B_half.txt" "../../inference/output/spec_inference_opt_half.txt" ############ Alignment between tensor model parallelism and pipeline parallelism only ################# @@ -205,8 +208,8 @@ if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then check_partial_token_match "../../inference/output/spec_inference_opt_half_tp.txt" "../../inference/output/spec_inference_opt_half.txt" diff <(tail -n +3 "../../inference/output/incr_decoding_llama_160M_tp.txt") <(tail -n +3 "../../inference/output/incr_decoding_llama_160M.txt") check_partial_token_match "../../inference/output/incr_decoding_llama_160M_half_tp.txt" "../../inference/output/incr_decoding_llama_160M_half.txt" - diff <(tail -n +3 "../../inference/output/incr_decoding_llama_7B_tp.txt") <(tail -n +3 "../../inference/output/incr_decoding_llama_7B.txt") - check_partial_token_match "../../inference/output/incr_decoding_llama_7B_half_tp.txt" "../../inference/output/incr_decoding_llama_7B_half.txt" + diff <(tail -n +3 "../../inference/output/incr_decoding_llama_2_7B_tp.txt") <(tail -n +3 "../../inference/output/incr_decoding_llama_2_7B.txt") + check_partial_token_match "../../inference/output/incr_decoding_llama_2_7B_half_tp.txt" "../../inference/output/incr_decoding_llama_2_7B_half.txt" diff <(tail -n +3 "../../inference/output/incr_decoding_opt_125M_tp.txt") <(tail -n +3 "../../inference/output/incr_decoding_opt_125M.txt") check_partial_token_match "../../inference/output/incr_decoding_opt_125M_half_tp.txt" "../../inference/output/incr_decoding_opt_125M_half.txt" diff <(tail -n +3 "../../inference/output/incr_decoding_opt_6B_tp.txt") <(tail -n +3 "../../inference/output/incr_decoding_opt_6B.txt") @@ -216,16 +219,16 @@ fi ######################### Alignment tests with HuggingFace #################################### # LLAMA (small model, full precision) -python3 ./huggingface_inference.py --model-name "JackFram/llama-160m-base" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M.txt" --gpu +python3 ./huggingface_inference.py --model-name "JackFram/llama-160m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M.txt" --gpu # LLAMA (small model, half precision) -python3 ./huggingface_inference.py --model-name "JackFram/llama-160m-base" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M_half.txt" --gpu +python3 ./huggingface_inference.py --model-name "JackFram/llama-160m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M_half.txt" --gpu # LLAMA (big model, full precision) -python3 ./huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B.txt" +python3 ./huggingface_inference.py --model-name "meta-llama/Llama-2-7b-hf" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_2_7B.txt" # LLAMA (big model, half 
precision) -python3 ./huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B_half.txt" --gpu +python3 ./huggingface_inference.py --model-name "meta-llama/Llama-2-7b-hf" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_2_7B_half.txt" --gpu # OPT (small model, full precision) python3 ./huggingface_inference.py --model-name "facebook/opt-125m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M.txt" --gpu --max-length 128 @@ -243,14 +246,14 @@ python3 ./huggingface_inference.py --model-name "facebook/opt-6.7b" --use-full-p python3 ./huggingface_inference.py --model-name "tiiuae/falcon-7b" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_falcon_7B.txt" --max-length 128 -diff <(tail -n +2 "../../inference/output/huggingface_llama_160M.txt") <(tail -n +5 "../../inference/output/incr_decoding_llama_160M.txt") -diff <(tail -n +2 "../../inference/output/huggingface_llama_160M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_decoding_llama_160M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff <(tail -n +2 "../../inference/output/huggingface_llama_7B.txt") <(tail -n +5 "../../inference/output/incr_decoding_llama_7B.txt") -diff <(tail -n +2 "../../inference/output/huggingface_llama_7B_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_decoding_llama_7B_half.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff "../../inference/output/huggingface_llama_160M.txt" <(tail -n +4 "../../inference/output/incr_decoding_llama_160M.txt") +diff <( < ../../inference/output/huggingface_llama_160M_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_decoding_llama_160M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff "../../inference/output/huggingface_llama_2_7B.txt" <(tail -n +4 "../../inference/output/incr_decoding_llama_2_7B.txt") +diff <( < ../../inference/output/huggingface_llama_2_7B_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_decoding_llama_2_7B_half.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff <(tail -n +2 "../../inference/output/huggingface_opt_125M.txt") <(tail -n +5 "../../inference/output/incr_decoding_opt_125M.txt") -diff <(tail -n +2 "../../inference/output/huggingface_opt_125M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_decoding_opt_125M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff <(tail -n +2 "../../inference/output/huggingface_opt_6B.txt") <(tail -n +5 "../../inference/output/incr_decoding_opt_6B.txt") -# diff <(tail -n +2 "../../inference/output/huggingface_opt_6B_half.txt") <(tail -n +5 "../../inference/output/incr_decoding_opt_6B_half.txt") -diff <(tail -n +2 "../../inference/output/huggingface_falcon_7B.txt") <(tail -n +5 "../../inference/output/incr_decoding_falcon_7B.txt") +diff "../../inference/output/huggingface_opt_125M.txt" <(tail -n +4 "../../inference/output/incr_decoding_opt_125M.txt") +diff <( < ../../inference/output/huggingface_opt_125M_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_decoding_opt_125M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff 
"../../inference/output/huggingface_opt_6B.txt" <(tail -n +4 "../../inference/output/incr_decoding_opt_6B.txt") +# diff "../../inference/output/huggingface_opt_6B_half.txt" <(tail -n +4 "../../inference/output/incr_decoding_opt_6B_half.txt") +diff "../../inference/output/huggingface_falcon_7B.txt" <(tail -n +4 "../../inference/output/incr_decoding_falcon_7B.txt") diff --git a/tests/inference/huggingface_inference.py b/tests/inference/huggingface_inference.py index fee215f4c4..5b533bf3c0 100644 --- a/tests/inference/huggingface_inference.py +++ b/tests/inference/huggingface_inference.py @@ -2,7 +2,14 @@ import json import os import torch -from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, LlamaTokenizer +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + AutoConfig, + LlamaTokenizer, + GenerationConfig, +) + def main(): # Change working dir to folder storing this script @@ -19,6 +26,7 @@ def main(): parser.add_argument( "--use-full-precision", action="store_true", help="Use full precision" ) + parser.add_argument("--do-sample", action="store_true", help="Use sampling") parser.add_argument("--gpu", action="store_true", help="Run on GPU") args = parser.parse_args() # Check if max-length is greater than 0 @@ -54,13 +62,19 @@ def main(): tokenizer = LlamaTokenizer.from_pretrained(args.model_name, use_fast=True) else: tokenizer = AutoTokenizer.from_pretrained(args.model_name) + generation_config = GenerationConfig.from_pretrained(args.model_name) + generation_config.do_sample = args.do_sample # Generate output with open(args.output_file, "w") as f: for i, prompt in enumerate(prompt_list): - batch = tokenizer( - prompt, return_tensors="pt", add_special_tokens=True - ).to(device) - generated = model.generate(batch["input_ids"], max_length=args.max_length) + batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True).to( + device + ) + generated = model.generate( + batch["input_ids"], + max_length=args.max_length, + generation_config=generation_config, + ) out = tokenizer.decode(generated[0]) # Write output to file out_str = out if i == (len(prompt_list) - 1) else out + "\n" diff --git a/tests/inference/python_inference_tests.sh b/tests/inference/python_inference_tests.sh index 64c61ba0dc..3544f58e26 100755 --- a/tests/inference/python_inference_tests.sh +++ b/tests/inference/python_inference_tests.sh @@ -108,40 +108,40 @@ function compare_decoding_steps_spec_infer_incr_decoding { ############ Alignment between speculative inference and incremental decoding ################# # Full precision -diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-7b-hf-full_prec-1_tp_4_pp.txt") <(tail -n +3 "../../inference/output/spec_infer-python-llama-7b-hf-full_prec-1_tp_4_pp.txt") +diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt") <(tail -n +3 "../../inference/output/spec_infer-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt") diff <(tail -n +3 "../../inference/output/incr_dec-python-opt-6.7b-full_prec-1_tp_4_pp.txt") <(tail -n +3 "../../inference/output/spec_infer-python-opt-6.7b-full_prec-1_tp_4_pp.txt") # Half precision -check_partial_token_match "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" +check_partial_token_match "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" 
check_partial_token_match "../../inference/output/incr_dec-python-opt-6.7b-half_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-opt-6.7b-half_prec-1_tp_4_pp.txt" # Speed test: speculative inference should be at very least 1.5x faster than incremental decoding # Full precision -compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_dec-python-llama-7b-hf-full_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-llama-7b-hf-full_prec-1_tp_4_pp.txt" +compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt" compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_dec-python-opt-6.7b-full_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-opt-6.7b-full_prec-1_tp_4_pp.txt" # Half precision -compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" +compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_dec-python-opt-6.7b-half_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-opt-6.7b-half_prec-1_tp_4_pp.txt" ############ Alignment between tensor model parallelism and pipeline parallelism only ################# ## Specinfer # LLAMA -diff <(tail -n +3 "../../inference/output/spec_infer-python-llama-7b-hf-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/spec_infer-python-llama-7b-hf-full_prec-1_tp_4_pp.txt") -check_partial_token_match "../../inference/output/spec_infer-python-llama-7b-hf-half_prec-2_tp_2_pp.txt" "../../inference/output/spec_infer-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" +diff <(tail -n +3 "../../inference/output/spec_infer-python-llama-2-7b-hf-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/spec_infer-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt") +check_partial_token_match "../../inference/output/spec_infer-python-llama-2-7b-hf-half_prec-2_tp_2_pp.txt" "../../inference/output/spec_infer-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" # OPT diff <(tail -n +3 "../../inference/output/spec_infer-python-opt-6.7b-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/spec_infer-python-opt-6.7b-full_prec-1_tp_4_pp.txt") check_partial_token_match "../../inference/output/spec_infer-python-opt-6.7b-half_prec-2_tp_2_pp.txt" "../../inference/output/spec_infer-python-opt-6.7b-half_prec-1_tp_4_pp.txt" ## Incremental decoding # Small LLAMA -diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-base-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-base-full_prec-1_tp_4_pp.txt") -check_partial_token_match "../../inference/output/incr_dec-python-llama-160m-base-half_prec-2_tp_2_pp.txt" "../../inference/output/incr_dec-python-llama-160m-base-half_prec-1_tp_4_pp.txt" -diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-base-full_prec-4_tp_1_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-base-full_prec-1_tp_4_pp.txt") -check_partial_token_match "../../inference/output/incr_dec-python-llama-160m-base-half_prec-4_tp_1_pp.txt" 
"../../inference/output/incr_dec-python-llama-160m-base-half_prec-1_tp_4_pp.txt" +diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-full_prec-1_tp_4_pp.txt") +check_partial_token_match "../../inference/output/incr_dec-python-llama-160m-half_prec-2_tp_2_pp.txt" "../../inference/output/incr_dec-python-llama-160m-half_prec-1_tp_4_pp.txt" +diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-full_prec-4_tp_1_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-full_prec-1_tp_4_pp.txt") +check_partial_token_match "../../inference/output/incr_dec-python-llama-160m-half_prec-4_tp_1_pp.txt" "../../inference/output/incr_dec-python-llama-160m-half_prec-1_tp_4_pp.txt" # Big LLAMA -diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-7b-hf-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-7b-hf-full_prec-1_tp_4_pp.txt") -check_partial_token_match "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-2_tp_2_pp.txt" "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" -#diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-7b-hf-full_prec-4_tp_1_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-7b-hf-full_prec-1_tp_4_pp.txt") -#check_partial_token_match "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-4_tp_1_pp.txt" "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" +diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt") +check_partial_token_match "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-2_tp_2_pp.txt" "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" +#diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-4_tp_1_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt") +#check_partial_token_match "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-4_tp_1_pp.txt" "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" # Small OPT diff <(tail -n +3 "../../inference/output/incr_dec-python-opt-125m-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-opt-125m-full_prec-1_tp_4_pp.txt") check_partial_token_match "../../inference/output/incr_dec-python-opt-125m-half_prec-2_tp_2_pp.txt" "../../inference/output/incr_dec-python-opt-125m-half_prec-1_tp_4_pp.txt" @@ -157,16 +157,16 @@ check_partial_token_match "../../inference/output/incr_dec-python-opt-6.7b-half_ ######################### Alignment tests with HuggingFace #################################### # LLAMA (small model, full precision) -python3 ./huggingface_inference.py --model-name "JackFram/llama-160m-base" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M.txt" --gpu +python3 ./huggingface_inference.py --model-name "JackFram/llama-160m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M.txt" --gpu # LLAMA (small model, half precision) -python3 ./huggingface_inference.py --model-name "JackFram/llama-160m-base" --prompt-file "../../inference/prompt/test.json" --output-file 
"../../inference/output/huggingface_llama_160M_half.txt" --gpu +python3 ./huggingface_inference.py --model-name "JackFram/llama-160m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M_half.txt" --gpu # LLAMA (big model, full precision) -python3 ./huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B.txt" +python3 ./huggingface_inference.py --model-name "meta-llama/Llama-2-7b-hf" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B.txt" # LLAMA (big model, half precision) -python3 ./huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B_half.txt" --gpu +python3 ./huggingface_inference.py --model-name "meta-llama/Llama-2-7b-hf" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B_half.txt" --gpu # OPT (small model, full precision) python3 ./huggingface_inference.py --model-name "facebook/opt-125m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M.txt" --gpu --max-length 128 @@ -183,13 +183,13 @@ python3 ./huggingface_inference.py --model-name "facebook/opt-6.7b" --use-full-p # Falcon (full precision) python3 ./huggingface_inference.py --model-name "tiiuae/falcon-7b" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_falcon_7B.txt" --max-length 128 -diff <(tail -n +2 "../../inference/output/huggingface_llama_160M.txt") <(tail -n +5 "../../inference/output/incr_dec-python-llama-160m-base-full_prec-1_tp_4_pp.txt") -diff <(tail -n +2 "../../inference/output/huggingface_llama_160M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_dec-python-llama-160m-base-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff <(tail -n +2 "../../inference/output/huggingface_llama_7B.txt") <(tail -n +5 "../../inference/output/incr_dec-python-llama-7b-hf-full_prec-1_tp_4_pp.txt") -diff <(tail -n +2 "../../inference/output/huggingface_llama_7B_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff "../../inference/output/huggingface_llama_160M.txt" <(tail -n +4 "../../inference/output/incr_dec-python-llama-160m-full_prec-1_tp_4_pp.txt") +diff <( < ../../inference/output/huggingface_llama_160M_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_dec-python-llama-160m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff "../../inference/output/huggingface_llama_7B.txt" <(tail -n +4 "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt") +diff <( < ../../inference/output/huggingface_llama_7B_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff <(tail -n +2 "../../inference/output/huggingface_opt_125M.txt") <(tail -n +5 "../../inference/output/incr_dec-python-opt-125m-full_prec-1_tp_4_pp.txt") -diff <(tail -n +2 
"../../inference/output/huggingface_opt_125M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_dec-python-opt-125m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff <(tail -n +2 "../../inference/output/huggingface_opt_6B.txt") <(tail -n +5 "../../inference/output/incr_dec-python-opt-6.7b-full_prec-1_tp_4_pp.txt") -#diff <(tail -n +2 "../../inference/output/huggingface_opt_6B_half.txt") <(tail -n +5 "../../inference/output/incr_dec-python-opt-6.7b-half_prec-1_tp_4_pp.txt") -diff <(tail -n +2 "../../inference/output/huggingface_falcon_7B.txt") <(tail -n +5 "../../inference/output/incr_dec-python-falcon-7b-half_prec-1_tp_4_pp.txt") +diff "../../inference/output/huggingface_opt_125M.txt" <(tail -n +4 "../../inference/output/incr_dec-python-opt-125m-full_prec-1_tp_4_pp.txt") +diff <( < ../../inference/output/huggingface_opt_125M_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_dec-python-opt-125m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff "../../inference/output/huggingface_opt_6B.txt" <(tail -n +4 "../../inference/output/incr_dec-python-opt-6.7b-full_prec-1_tp_4_pp.txt") +#diff "../../inference/output/huggingface_opt_6B_half.txt" <(tail -n +4 "../../inference/output/incr_dec-python-opt-6.7b-half_prec-1_tp_4_pp.txt") +diff "../../inference/output/huggingface_falcon_7B.txt" <(tail -n +4 "../../inference/output/incr_dec-python-falcon-7b-full_prec-1_tp_4_pp.txt") diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index 8efe8999c4..ebaadade32 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -6,7 +6,7 @@ # required parameters "num_gpus": 4, "memory_per_gpu": 14000, - "zero_copy_memory_per_node": 30000, + "zero_copy_memory_per_node": 40000, # optional parameters "num_cpus": 4, "legion_utility_processors": 4, @@ -35,7 +35,7 @@ "ssms": [ { # required ssm parameter - "ssm_model": "JackFram/llama-160m-base", + "ssm_model": "JackFram/llama-160m", # optional ssm parameters "cache_path": "", "refresh_cache": False, @@ -47,12 +47,16 @@ ff_init_configs.update(llm_configs) # Test parameters to fill in -llama_models = ["decapoda-research/llama-7b-hf", "JackFram/llama-160m-base"] +llama_models = ["meta-llama/Llama-2-7b-hf", "JackFram/llama-160m"] opt_models = ["facebook/opt-6.7b", "facebook/opt-125m"] -falcon_models = ["tiiuae/falcon-7b",] -mpt_models = ["mosaicml/mpt-7b", ] +falcon_models = [ + "tiiuae/falcon-7b", +] +mpt_models = [ + "mosaicml/mpt-7b", +] # starcoder_models = ["bigcode/starcoderbase-7b",] -parallelism_settings = [(1,4), (2,2), (4,1)] +parallelism_settings = [(1, 4), (2, 2), (4, 1)] # The paths below should be with respect to the folder from which the tests are launched (FF_HOME/tests/inference) prompt_file = "../../inference/prompt/test.json" @@ -69,7 +73,6 @@ for model_name in all_models: for full_precision in (True, False): for parallelism_degrees in parallelism_settings: - tp, pp = parallelism_degrees # Tensor parallelism not supported by small Falcon model atm @@ -79,14 +82,21 @@ if tp > 2 and ("7b" in model_name or "6.7b" in model_name): continue - if full_precision and ("falcon" in model_name or "starcoder" in model_name): + # Run Falcon only in full precision, Starcoder only in half precision + if (not full_precision and "falcon" in model_name) or (full_precision and "starcoder" in model_name): continue 
-
+
            _, after_slash = model_name.rsplit("/", maxsplit=1)
-           filename = "incr_dec-" + "python-" + after_slash + ("-full_prec-" if full_precision else "-half_prec-") + f"{tp}_tp_{pp}_pp"
+           filename = (
+               "incr_dec-"
+               + "python-"
+               + after_slash.lower()
+               + ("-full_prec-" if full_precision else "-half_prec-")
+               + f"{tp}_tp_{pp}_pp"
+           )
            test_configs_file = "./" + filename + ".json"
-           output_file = os.path.join(output_folder, filename+".txt")
-
+           output_file = os.path.join(output_folder, filename + ".txt")
+
            ff_init_configs["tensor_parallelism_degree"] = tp
            ff_init_configs["pipeline_parallelism_degree"] = pp
            ff_init_configs["llm_model"] = model_name
@@ -110,17 +120,23 @@
                continue
 
            _, after_slash = big_model.rsplit("/", maxsplit=1)
-           filename = "spec_infer-" + "python-" + after_slash + ("-full_prec-" if full_precision else "-half_prec-") + f"{tp}_tp_{pp}_pp"
+           filename = (
+               "spec_infer-"
+               + "python-"
+               + after_slash.lower()
+               + ("-full_prec-" if full_precision else "-half_prec-")
+               + f"{tp}_tp_{pp}_pp"
+           )
            test_configs_file = "./" + filename + ".json"
-           output_file = os.path.join(output_folder, filename+".txt")
-
+           output_file = os.path.join(output_folder, filename + ".txt")
+
            ff_init_configs["tensor_parallelism_degree"] = tp
            ff_init_configs["pipeline_parallelism_degree"] = pp
            ff_init_configs["llm_model"] = big_model
            ff_init_configs["full_precision"] = full_precision
            ff_init_configs["output_file"] = output_file
            ff_init_configs["prompt"] = prompt_file
-
+
            ssm_configs["ssms"][0]["ssm_model"] = small_model
            ssm_configs["ssms"][0]["full_precision"] = full_precision
            ff_init_configs.update(ssm_configs)
diff --git a/tests/inference_tests.sh b/tests/inference_tests.sh
index c757dd5ee6..895b74c798 100755
--- a/tests/inference_tests.sh
+++ b/tests/inference_tests.sh
@@ -16,6 +16,12 @@ CPP_INFERENCE_TESTS=${CPP_INFERENCE_TESTS:-OFF}
 # Enable model parallelism tests in C++, if desired
 TENSOR_PARALLELISM_TESTS=${TENSOR_PARALLELISM_TESTS:-OFF}
 
+# Token to access private huggingface models (e.g. LLAMA-2)
+HUGGINGFACE_TOKEN=${HUGGINGFACE_TOKEN:-none}
+if [[ "$HUGGINGFACE_TOKEN" != "none" ]]; then
+    huggingface-cli login --token "$HUGGINGFACE_TOKEN"
+fi
+
 # Clean up before test (just in case)
 cleanup
 
@@ -24,7 +30,7 @@ pip3 install protobuf==3.20.3
 
 # Create test prompt file
 mkdir -p ../inference/prompt
-echo '["Give three tips for staying healthy."]' > ../inference/prompt/test.json
+echo '["Three tips for staying healthy are: "]' > ../inference/prompt/test.json
 
 # Create output folder
 mkdir -p ../inference/output
@@ -38,7 +44,7 @@ if [[ "$PYTHON_INFERENCE_TESTS" == "ON" ]]; then
 fi
 if [[ "$CPP_INFERENCE_TESTS" == "ON" ]]; then
     # Manually download the weights in both half and full precision
-    python3 ../inference/utils/download_hf_model.py "decapoda-research/llama-7b-hf" "JackFram/llama-160m-base" "facebook/opt-6.7b" "facebook/opt-125m" "tiiuae/falcon-7b"
+    python3 ../inference/utils/download_hf_model.py "meta-llama/Llama-2-7b-hf" "JackFram/llama-160m" "facebook/opt-6.7b" "facebook/opt-125m" "tiiuae/falcon-7b"
     echo "Running C++ inference tests..."
     ./inference/cpp_inference_tests.sh
 fi
diff --git a/tests/multinode_helpers/mpi_wrapper1.sh b/tests/multinode_helpers/mpi_wrapper1.sh
index 87d17d11a3..076fd2d66c 100755
--- a/tests/multinode_helpers/mpi_wrapper1.sh
+++ b/tests/multinode_helpers/mpi_wrapper1.sh
@@ -8,5 +8,5 @@ if [ -z "$GPUS" ]; then echo "GPUS variable is not defined, aborting tests"; exi
 
 # We need to wrap the instruction below in its own script because MPI throws an error if we try
 # to run "mpirun" more than once in the same script. Hence, we cannot simply call "mpirun" in the
-# multi_gpu_tests.sh script
+# training_tests.sh script
 mpirun -np "$NUM_NODES" "$FF_HOME"/tests/multinode_helpers/mpi_wrapper2.sh "$@"
diff --git a/tests/python_interface_test.sh b/tests/python_interface_test.sh
index 4f83918a49..5ce4d9803b 100755
--- a/tests/python_interface_test.sh
+++ b/tests/python_interface_test.sh
@@ -14,13 +14,13 @@ check_python_interface() {
 
    # Generate configs JSON files
    test_params=$(jq -n --arg num_gpus "$GPUS" --arg memory_per_gpu "$FSIZE" --arg zero_copy_memory_per_node "$ZSIZE" --arg batch_size "$BATCHSIZE" --arg only_data_parallel "$ONLY_DATA_PARALLEL" '{"num_gpus":$num_gpus,"memory_per_gpu":$memory_per_gpu,"zero_copy_memory_per_node":$zero_copy_memory_per_node,"batch_size":$batch_size,"only_data_parallel":$only_data_parallel}')
-   mkdir -p /tmp/flexflow/multi_gpu_tests
-   echo "$test_params" > /tmp/flexflow/multi_gpu_tests/test_params.json
+   mkdir -p /tmp/flexflow/training_tests
+   echo "$test_params" > /tmp/flexflow/training_tests/test_params.json
 
    if [[ "$interpreter" == "python" ]]; then
        EXE="python"
        echo "Running a single-GPU Python test to check the Python interface (native python interpreter)"
-       $EXE "$FF_HOME"/examples/python/keras/seq_mnist_mlp.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
+       $EXE "$FF_HOME"/examples/python/keras/seq_mnist_mlp.py -config-file /tmp/flexflow/training_tests/test_params.json
    elif [[ "$interpreter" == "flexflow_python" ]]; then
        if [[ "$installation_status" == "before-installation" ]]; then
            EXE="$BUILD_FOLDER"/flexflow_python
diff --git a/tests/multi_gpu_tests.sh b/tests/training_tests.sh
similarity index 61%
rename from tests/multi_gpu_tests.sh
rename to tests/training_tests.sh
index 3a6f6467df..2d1f00883b 100755
--- a/tests/multi_gpu_tests.sh
+++ b/tests/training_tests.sh
@@ -33,57 +33,57 @@ test_params_5_epochs=$(echo "$test_params" | jq '. + {"epochs": 5}')
 test_params_40_epochs=$(echo "$test_params" | jq '. + {"epochs": 40}')
 test_params_5_epochs_no_batch_size=$(echo "$test_params_5_epochs" | jq 'del(.batch_size)')
 test_params_40_epochs_no_batch_size=$(echo "$test_params_40_epochs" | jq 'del(.batch_size)')
-mkdir -p /tmp/flexflow/multi_gpu_tests
-echo "$test_params" > /tmp/flexflow/multi_gpu_tests/test_params.json
-echo "$test_params_5_epochs" > /tmp/flexflow/multi_gpu_tests/test_params_5_epochs.json
-echo "$test_params_5_epochs_no_batch_size" > /tmp/flexflow/multi_gpu_tests/test_params_5_epochs_no_batch_size.json
-echo "$test_params_40_epochs_no_batch_size" > /tmp/flexflow/multi_gpu_tests/test_params_40_epochs_no_batch_size.json
+mkdir -p /tmp/flexflow/training_tests
+echo "$test_params" > /tmp/flexflow/training_tests/test_params.json
+echo "$test_params_5_epochs" > /tmp/flexflow/training_tests/test_params_5_epochs.json
+echo "$test_params_5_epochs_no_batch_size" > /tmp/flexflow/training_tests/test_params_5_epochs_no_batch_size.json
+echo "$test_params_40_epochs_no_batch_size" > /tmp/flexflow/training_tests/test_params_40_epochs_no_batch_size.json
 
 #Sequential model tests
-$EXE "$FF_HOME"/examples/python/keras/seq_mnist_mlp.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/seq_mnist_cnn.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-#$EXE "$FF_HOME"/examples/python/keras/seq_reuters_mlp.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/seq_cifar10_cnn.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/seq_mnist_mlp_net2net.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/seq_mnist_cnn_net2net.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/seq_mnist_cnn_nested.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/seq_mnist_mlp.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/seq_mnist_cnn.py -config-file /tmp/flexflow/training_tests/test_params.json
+#$EXE "$FF_HOME"/examples/python/keras/seq_reuters_mlp.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/seq_cifar10_cnn.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/seq_mnist_mlp_net2net.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/seq_mnist_cnn_net2net.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/seq_mnist_cnn_nested.py -config-file /tmp/flexflow/training_tests/test_params.json
 
 #Keras other
-$EXE "$FF_HOME"/examples/python/keras/callback.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/unary.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/reshape.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/elementwise_mul_broadcast.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/reduce_sum.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/identity_loss.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/elementwise_max_min.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/rsqrt.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/gather.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/regularizer.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/callback.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/unary.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/reshape.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/elementwise_mul_broadcast.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/reduce_sum.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/identity_loss.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/elementwise_max_min.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/rsqrt.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/gather.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/regularizer.py -config-file /tmp/flexflow/training_tests/test_params.json
 
 #Functional API
-$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp_concat.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp_concat2.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/func_mnist_cnn.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/func_mnist_cnn_concat.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_nested.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/func_cifar10_alexnet.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp_net2net.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_net2net.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp_concat.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp_concat2.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/func_mnist_cnn.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/func_mnist_cnn_concat.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_nested.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/func_cifar10_alexnet.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp_net2net.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_net2net.py -config-file /tmp/flexflow/training_tests/test_params.json
 
 #Python
-$EXE "$FF_HOME"/examples/python/native/print_layers.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_5_epochs.json
-$EXE "$FF_HOME"/examples/python/native/split.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/native/alexnet.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_40_epochs_no_batch_size.json
-$EXE "$FF_HOME"/examples/python/native/mnist_mlp.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_5_epochs.json
-$EXE "$FF_HOME"/examples/python/native/mnist_cnn.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_5_epochs.json
-$EXE "$FF_HOME"/examples/python/native/cifar10_cnn.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_40_epochs_no_batch_size.json
-$EXE "$FF_HOME"/examples/python/native/cifar10_cnn_attach.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_5_epochs_no_batch_size.json
-$EXE "$FF_HOME"/examples/python/native/mnist_mlp_attach.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_5_epochs_no_batch_size.json
+$EXE "$FF_HOME"/examples/python/native/print_layers.py -config-file /tmp/flexflow/training_tests/test_params_5_epochs.json
+$EXE "$FF_HOME"/examples/python/native/split.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/native/alexnet.py -config-file /tmp/flexflow/training_tests/test_params_40_epochs_no_batch_size.json
+$EXE "$FF_HOME"/examples/python/native/mnist_mlp.py -config-file /tmp/flexflow/training_tests/test_params_5_epochs.json
+$EXE "$FF_HOME"/examples/python/native/mnist_cnn.py -config-file /tmp/flexflow/training_tests/test_params_5_epochs.json
+$EXE "$FF_HOME"/examples/python/native/cifar10_cnn.py -config-file /tmp/flexflow/training_tests/test_params_40_epochs_no_batch_size.json
+$EXE "$FF_HOME"/examples/python/native/cifar10_cnn_attach.py -config-file /tmp/flexflow/training_tests/test_params_5_epochs_no_batch_size.json
+$EXE "$FF_HOME"/examples/python/native/mnist_mlp_attach.py -config-file /tmp/flexflow/training_tests/test_params_5_epochs_no_batch_size.json
 
 #Possible crash
-$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat_model.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat_seq_model.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/native/cifar10_cnn_concat.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_40_epochs_no_batch_size.json
+$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat_model.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat_seq_model.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/native/cifar10_cnn_concat.py -config-file /tmp/flexflow/training_tests/test_params_40_epochs_no_batch_size.json
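
Reviewer note: since the diff now requires a HuggingFace token to fetch the gated meta-llama/Llama-2-7b-hf weights, the sketch below shows how a developer might reproduce the updated Python inference tests locally. This is not part of the patch; the token value and FF_HOME path are placeholders, and the working directory assumption (FF_HOME/tests) is inferred from the relative paths used in the scripts above, so it may need adjusting for a particular build layout.

```bash
# Minimal local sketch (assumes FF_HOME points at a FlexFlow checkout with a built tree).
export HUGGINGFACE_TOKEN=hf_xxxxxxxxxxxx   # placeholder; read by tests/inference_tests.sh for `huggingface-cli login`
export FF_HOME=/path/to/FlexFlow           # placeholder checkout path

cd "$FF_HOME"/tests
# Run only the Python inference tests; the C++ tests stay off unless CPP_INFERENCE_TESTS=ON.
PYTHON_INFERENCE_TESTS=ON CPP_INFERENCE_TESTS=OFF ./inference_tests.sh
```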