[Core] Deprecating block manager v1 and make block manager v2 default (vllm-project#8704)

Remove block manager v1. This is the initial piece of the prefix-caching-centric design: to get there, we need to simplify the code path so that only the v2 block manager (which has much higher prefix-caching performance) is used.
KuntaiDu authored Oct 17, 2024
1 parent 5eda21e commit 81ede99
Showing 45 changed files with 206 additions and 2,109 deletions.
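
In practical terms, block manager v2 is now the default, so the --use-v2-block-manager flag and the VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 escape hatch removed throughout this diff are no longer needed. A minimal sketch of post-change usage (the model name and prompt are illustrative; the LLM arguments mirror the ones touched in the benchmark and test diffs below):

from vllm import LLM, SamplingParams

# Prefix caching is served by the v2 block manager by default; neither
# use_v2_block_manager=True nor VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 is required.
llm = LLM(
    model="facebook/opt-125m",  # illustrative model
    enable_prefix_caching=True,
    enforce_eager=True,
)

sampling_params = SamplingParams(temperature=0, max_tokens=32)
outputs = llm.generate(["San Francisco is a"], sampling_params)
print(outputs[0].outputs[0].text)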
18 changes: 6 additions & 12 deletions .buildkite/test-pipeline.yaml
@@ -77,8 +77,8 @@ steps:
- vllm/
- tests/basic_correctness/test_chunked_prefill
commands:
- VLLM_ATTENTION_BACKEND=XFORMERS VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py

- label: Core Test # 10min
mirror_hardwares: [amd]
@@ -88,11 +88,7 @@ steps:
- vllm/distributed
- tests/core
commands:
- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core/test_scheduler.py
- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/test_chunked_prefill_scheduler.py
- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/block/e2e/test_correctness.py
- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/block/e2e/test_correctness_sliding_window.py
- pytest -v -s core --ignore=core/block/e2e/test_correctness.py --ignore=core/test_scheduler.py --ignore=core/test_chunked_prefill_scheduler.py --ignore=core/block/e2e/test_correctness.py --ignore=core/block/e2e/test_correctness_sliding_window.py
- pytest -v -s core

- label: Entrypoints Test # 40min
working_dir: "/vllm-workspace/tests"
@@ -192,8 +188,7 @@ steps:
- vllm/
- tests/prefix_caching
commands:
- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s prefix_caching/test_prefix_caching.py
- pytest -v -s prefix_caching --ignore=prefix_caching/test_prefix_caching.py
- pytest -v -s prefix_caching

- label: Samplers Test # 36min
source_file_dependencies:
@@ -217,8 +212,7 @@ steps:
- tests/spec_decode
commands:
- pytest -v -s spec_decode/e2e/test_multistep_correctness.py
- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s spec_decode/e2e/test_compatibility.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_compatibility.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py

- label: LoRA Test %N # 15min each
mirror_hardwares: [amd]
@@ -405,7 +399,7 @@ steps:
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
- TARGET_TEST_SUITE=L4 VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest basic_correctness/ -v -s -m distributed_2_gpus
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
4 changes: 0 additions & 4 deletions benchmarks/benchmark_latency.py
@@ -38,7 +38,6 @@ def main(args: argparse.Namespace):
quantization_param_path=args.quantization_param_path,
device=args.device,
ray_workers_use_nsight=args.ray_workers_use_nsight,
use_v2_block_manager=args.use_v2_block_manager,
enable_chunked_prefill=args.enable_chunked_prefill,
download_dir=args.download_dir,
block_size=args.block_size,
@@ -221,9 +220,6 @@ def run_to_completion(profile_dir: Optional[str] = None):
parser.add_argument("--enable-prefix-caching",
action='store_true',
help="Enable automatic prefix caching")
parser.add_argument('--use-v2-block-manager',
action='store_true',
default=EngineArgs.use_v2_block_manager)
parser.add_argument(
"--ray-workers-use-nsight",
action='store_true',
6 changes: 0 additions & 6 deletions benchmarks/benchmark_prefix_caching.py
@@ -33,7 +33,6 @@
from transformers import PreTrainedTokenizerBase

from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser

try:
@@ -134,7 +133,6 @@ def main(args):
tokenizer_mode='auto',
trust_remote_code=True,
enforce_eager=True,
use_v2_block_manager=args.use_v2_block_manager,
tensor_parallel_size=args.tensor_parallel_size,
enable_prefix_caching=args.enable_prefix_caching)

@@ -176,10 +174,6 @@ def main(args):
parser.add_argument('--enable-prefix-caching',
action='store_true',
help='enable prefix caching')
parser.add_argument('--use-v2-block-manager',
action='store_true',
default=EngineArgs.use_v2_block_manager,
help='Use BlockSpaceMangerV2')
parser.add_argument('--num-prompts',
type=int,
default=1,
11 changes: 1 addition & 10 deletions benchmarks/benchmark_throughput.py
@@ -86,7 +86,6 @@ def run_vllm(
distributed_executor_backend: Optional[str],
gpu_memory_utilization: float = 0.9,
num_scheduler_steps: int = 1,
use_v2_block_manager: bool = False,
download_dir: Optional[str] = None,
load_format: str = EngineArgs.load_format,
disable_async_output_proc: bool = False,
@@ -113,7 +112,6 @@
distributed_executor_backend=distributed_executor_backend,
load_format=load_format,
num_scheduler_steps=num_scheduler_steps,
use_v2_block_manager=use_v2_block_manager,
disable_async_output_proc=disable_async_output_proc,
)

@@ -176,7 +174,6 @@ async def run_vllm_async(
distributed_executor_backend: Optional[str],
gpu_memory_utilization: float = 0.9,
num_scheduler_steps: int = 1,
use_v2_block_manager: bool = False,
download_dir: Optional[str] = None,
load_format: str = EngineArgs.load_format,
disable_async_output_proc: bool = False,
@@ -204,7 +201,6 @@ async def run_vllm_async(
distributed_executor_backend=distributed_executor_backend,
load_format=load_format,
num_scheduler_steps=num_scheduler_steps,
use_v2_block_manager=use_v2_block_manager,
disable_async_output_proc=disable_async_output_proc,
worker_use_ray=False,
disable_log_requests=True,
@@ -341,8 +337,7 @@ def main(args: argparse.Namespace):
args.enable_prefix_caching, args.enable_chunked_prefill,
args.max_num_batched_tokens, args.distributed_executor_backend,
args.gpu_memory_utilization, args.num_scheduler_steps,
args.use_v2_block_manager, args.download_dir, args.load_format,
args.disable_async_output_proc
args.download_dir, args.load_format, args.disable_async_output_proc
]

if args.async_engine:
@@ -471,10 +466,6 @@ def main(args: argparse.Namespace):
type=int,
default=1,
help="Maximum number of forward steps per scheduler call.")
parser.add_argument("--use-v2-block-manager",
action='store_true',
default=EngineArgs.use_v2_block_manager,
help="Enable block manager v2.")
parser.add_argument(
"--enable-prefix-caching",
action='store_true',
4 changes: 0 additions & 4 deletions benchmarks/overheads/benchmark_hashing.py
@@ -16,7 +16,6 @@ def main(args):
enforce_eager=True,
enable_prefix_caching=True,
tensor_parallel_size=args.tensor_parallel_size,
use_v2_block_manager=args.use_v2_block_manager,
)

sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
@@ -56,8 +55,5 @@ def main(args):
parser.add_argument('--enable-prefix-caching',
action='store_true',
help='enable prefix caching')
parser.add_argument('--use-v2-block-manager',
action='store_true',
help='Use BlockSpaceMangerV2')
args = parser.parse_args()
main(args)
3 changes: 0 additions & 3 deletions docs/source/models/spec_decode.rst
@@ -30,7 +30,6 @@ The following code configures vLLM in an offline mode to use speculative decoding
tensor_parallel_size=1,
speculative_model="facebook/opt-125m",
num_speculative_tokens=5,
use_v2_block_manager=True,
)
outputs = llm.generate(prompts, sampling_params)
@@ -104,7 +103,6 @@ matching n-grams in the prompt. For more information read `this thread. <https:/
speculative_model="[ngram]",
num_speculative_tokens=5,
ngram_prompt_lookup_max=4,
use_v2_block_manager=True,
)
outputs = llm.generate(prompts, sampling_params)
@@ -135,7 +133,6 @@ For more information see `this blog <https://pytorch.org/blog/hitchhikers-guide-
tensor_parallel_size=4,
speculative_model="ibm-fms/llama3-70b-accelerator",
speculative_draft_tensor_parallel_size=1,
use_v2_block_manager=True,
)
outputs = llm.generate(prompts, sampling_params)
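For reference, a sketch of the offline speculative-decoding example after this change (the base model name and prompt are illustrative; the speculative settings come from the hunk above, and use_v2_block_manager=True is simply dropped):

from vllm import LLM, SamplingParams

prompts = ["The future of AI is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm = LLM(
    model="facebook/opt-6.7b",  # illustrative base model
    tensor_parallel_size=1,
    speculative_model="facebook/opt-125m",
    num_speculative_tokens=5,
    # use_v2_block_manager=True removed: v2 is now the default block manager.
)
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    print(output.outputs[0].text)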
2 changes: 0 additions & 2 deletions examples/offline_inference_mlpspeculator.py
@@ -50,8 +50,6 @@ def time_generation(llm: LLM, prompts: List[str],
llm = LLM(
model="meta-llama/Llama-2-13b-chat-hf",
speculative_model="ibm-fms/llama-13b-accelerator",
# These are currently required for MLPSpeculator decoding
use_v2_block_manager=True,
)

print("With speculation")
11 changes: 1 addition & 10 deletions tests/basic_correctness/test_chunked_prefill.py
@@ -12,20 +12,14 @@
import pytest

from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import check_deprecated_block_manager_usage, multi_gpu_test
from ..utils import multi_gpu_test

MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-2-7b-hf",
]


@pytest.fixture(scope="module", autouse=True)
def check_deprecated_block_manager():
check_deprecated_block_manager_usage(
'tests/basic_correctness/test_chunked_prefill.py')


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@@ -197,7 +191,6 @@ def test_models_with_fp8_kv_cache(
@pytest.mark.parametrize("max_tokens", [16])
@pytest.mark.parametrize("enforce_eager", [False])
@pytest.mark.parametrize("chunk_size", [30, 32])
@pytest.mark.parametrize("use_v2_block_manager", [False, True])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1])
Expand All @@ -206,7 +199,6 @@ def test_with_prefix_caching(
max_tokens: int,
enforce_eager: bool,
chunk_size: int,
use_v2_block_manager: bool,
tensor_parallel_size: int,
) -> None:
"""
@@ -234,7 +226,6 @@ def test_with_prefix_caching(
enable_chunked_prefill=True,
enable_prefix_caching=enable,
tensor_parallel_size=tensor_parallel_size,
use_v2_block_manager=use_v2_block_manager,
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
) as vllm_model:
(Remaining changed files not rendered.)