[Core] Add an environment variable which needs to be set explicitly to allow BlockSpaceManagerV1 (vllm-project#9149)
sroy745 authored Oct 10, 2024
1 parent a64e7b9 commit f3a507f
Showing 14 changed files with 94 additions and 8 deletions.
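
Before the per-file diffs: the practical effect of this change is that any configuration selecting the legacy BlockSpaceManagerV1 (i.e. running without --use-v2-block-manager) must also export the new environment variable. A minimal sketch of the opt-in, assuming the facebook/opt-125m model used elsewhere in the test suite:

# Illustrative sketch, not part of the diff: explicit opt-in to the deprecated
# BlockSpaceManagerV1. The variable must be visible before vLLM validates its config.
import os

os.environ["VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1"] = "1"

from vllm import LLM, SamplingParams

# use_v2_block_manager=False selects the deprecated BlockSpaceManagerV1.
llm = LLM(model="facebook/opt-125m", use_v2_block_manager=False)
print(llm.generate(["Hello, my name is"], SamplingParams(max_tokens=8)))

Without the export, the same call fails config validation with the error added in vllm/config.py below.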
18 changes: 12 additions & 6 deletions .buildkite/test-pipeline.yaml
@@ -77,8 +77,8 @@ steps:
- vllm/
- tests/basic_correctness/test_chunked_prefill
commands:
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=XFORMERS VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py

- label: Core Test # 10min
mirror_hardwares: [amd]
@@ -88,7 +88,11 @@ steps:
- vllm/distributed
- tests/core
commands:
- pytest -v -s core
- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core/test_scheduler.py
- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core/test_chunked_prefill_scheduler.py
- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core/block/e2e/test_correctness.py
- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core/block/e2e/test_correctness_sliding_window.py
- pytest -v -s core --ignore=core/test_scheduler.py --ignore=core/test_chunked_prefill_scheduler.py --ignore=core/block/e2e/test_correctness.py --ignore=core/block/e2e/test_correctness_sliding_window.py

- label: Entrypoints Test # 40min
working_dir: "/vllm-workspace/tests"
@@ -185,7 +189,8 @@ steps:
- vllm/
- tests/prefix_caching
commands:
- pytest -v -s prefix_caching
- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s prefix_caching/test_prefix_caching.py
- pytest -v -s prefix_caching --ignore=prefix_caching/test_prefix_caching.py

- label: Samplers Test # 36min
source_file_dependencies:
@@ -209,7 +214,8 @@ steps:
- tests/spec_decode
commands:
- pytest -v -s spec_decode/e2e/test_multistep_correctness.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
- VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s spec_decode/e2e/test_compatibility.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_compatibility.py

- label: LoRA Test %N # 15min each
mirror_hardwares: [amd]
@@ -391,7 +397,7 @@ steps:
- pytest -v -s ./compile/test_full_graph_multi_gpu.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
- TARGET_TEST_SUITE=L4 VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest basic_correctness/ -v -s -m distributed_2_gpus
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
4 changes: 3 additions & 1 deletion benchmarks/benchmark_latency.py
@@ -221,7 +221,9 @@ def run_to_completion(profile_dir: Optional[str] = None):
parser.add_argument("--enable-prefix-caching",
action='store_true',
help="Enable automatic prefix caching")
parser.add_argument('--use-v2-block-manager', action='store_true')
parser.add_argument('--use-v2-block-manager',
action='store_true',
default=EngineArgs.use_v2_block_manager)
parser.add_argument(
"--ray-workers-use-nsight",
action='store_true',
2 changes: 2 additions & 0 deletions benchmarks/benchmark_prefix_caching.py
@@ -33,6 +33,7 @@
from transformers import PreTrainedTokenizerBase

from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser

try:
@@ -177,6 +178,7 @@ def main(args):
help='enable prefix caching')
parser.add_argument('--use-v2-block-manager',
action='store_true',
default=EngineArgs.use_v2_block_manager,
help='Use BlockSpaceManagerV2')
parser.add_argument('--num-prompts',
type=int,
1 change: 1 addition & 0 deletions benchmarks/benchmark_throughput.py
@@ -473,6 +473,7 @@ def main(args: argparse.Namespace):
help="Maximum number of forward steps per scheduler call.")
parser.add_argument("--use-v2-block-manager",
action='store_true',
default=EngineArgs.use_v2_block_manager,
help="Enable block manager v2.")
parser.add_argument(
"--enable-prefix-caching",
8 changes: 7 additions & 1 deletion tests/basic_correctness/test_chunked_prefill.py
@@ -12,14 +12,20 @@
import pytest

from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import multi_gpu_test
from ..utils import check_deprecated_block_manager_usage, multi_gpu_test

MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-2-7b-hf",
]


@pytest.fixture(scope="module", autouse=True)
def check_deprecated_block_manager():
check_deprecated_block_manager_usage(
'tests/basic_correctness/test_chunked_prefill.py')


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
7 changes: 7 additions & 0 deletions tests/core/block/e2e/test_correctness.py
@@ -2,11 +2,18 @@

import pytest

from tests.utils import check_deprecated_block_manager_usage
from vllm import SamplingParams

from .conftest import get_token_ids_from_llm_generator


@pytest.fixture(scope="module", autouse=True)
def check_deprecated_block_manager():
check_deprecated_block_manager_usage(
'tests/core/block/e2e/test_correctness.py')


@pytest.mark.parametrize(
"common_llm_kwargs",
[{
7 changes: 7 additions & 0 deletions tests/core/block/e2e/test_correctness_sliding_window.py
@@ -3,6 +3,7 @@

import pytest

from tests.utils import check_deprecated_block_manager_usage
from vllm import LLM, SamplingParams

from .conftest import get_text_from_llm_generator
@@ -12,6 +13,12 @@
BLOCK_SIZE = 16


@pytest.fixture(scope="module", autouse=True)
def check_deprecated_block_manager():
check_deprecated_block_manager_usage(
'tests/core/block/e2e/test_correctness_sliding_window.py')


@pytest.mark.parametrize(
"common_llm_kwargs",
[{
7 changes: 7 additions & 0 deletions tests/core/test_chunked_prefill_scheduler.py
@@ -8,6 +8,7 @@
from vllm.core.scheduler import Scheduler
from vllm.sequence import Logprob, SequenceGroup

from ..utils import check_deprecated_block_manager_usage
from .utils import create_dummy_prompt


@@ -27,6 +28,12 @@ def schedule_and_update_computed_tokens(scheduler):
return metas, out


@pytest.fixture(scope="module", autouse=True)
def check_deprecated_block_manager():
check_deprecated_block_manager_usage(
'tests/core/test_chunked_prefill_scheduler.py')


@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_simple(use_v2_block_manager: bool):
"""Verify basic scheduling works."""
7 changes: 7 additions & 0 deletions tests/core/test_scheduler.py
@@ -12,11 +12,18 @@
from vllm.lora.request import LoRARequest
from vllm.sequence import SequenceGroup, SequenceStatus

from ..utils import check_deprecated_block_manager_usage
from .utils import (append_new_token, append_new_token_seq_group,
create_dummy_prompt, get_sequence_groups,
schedule_and_update_computed_tokens)


@pytest.fixture(scope="module", autouse=True)
def check_deprecated_block_manager():
check_deprecated_block_manager_usage(
"tests/core/test_scheduler.py")


@pytest.mark.parametrize('use_v2_block_manager', [True, False])
def test_scheduler_add_seq_group(use_v2_block_manager: bool):
block_size = 4
7 changes: 7 additions & 0 deletions tests/prefix_caching/test_prefix_caching.py
@@ -7,6 +7,7 @@
import pytest

from tests.kernels.utils import override_backend_env_variable
from tests.utils import check_deprecated_block_manager_usage
from vllm.block import PhysicalTokenBlock
from vllm.core.block_manager_v1 import CachedBlockAllocator
from vllm.utils import Device
@@ -18,6 +19,12 @@
]


@pytest.fixture(scope="module", autouse=True)
def check_deprecated_block_manager():
check_deprecated_block_manager_usage(
'tests/prefix_caching/test_prefix_caching.py')


@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("num_blocks", [16])
def test_block_allocator(
7 changes: 7 additions & 0 deletions tests/spec_decode/e2e/test_compatibility.py
@@ -1,10 +1,17 @@
import pytest

from tests.utils import check_deprecated_block_manager_usage
from vllm import SamplingParams

from .conftest import get_output_from_llm_generator


@pytest.fixture(scope="module", autouse=True)
def check_deprecated_block_manager():
check_deprecated_block_manager_usage(
'tests/spec_decode/e2e/test_compatibility.py')


@pytest.mark.parametrize(
"common_llm_kwargs",
[{
9 changes: 9 additions & 0 deletions tests/utils.py
@@ -678,3 +678,12 @@ def get_client_text_logprob_generations(
return [(text_generations, text,
(None if x.logprobs is None else x.logprobs.top_logprobs))
for completion in completions for x in completion.choices]


def check_deprecated_block_manager_usage(test_name: str):
assert envs.VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1 is True, (
f"To allow the use of deprecated BlockSpaceManagerV1, set the "
f"environment variable VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1. "
f"You can run the tests with: "
f"`VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest {test_name}`" #noqa
)
12 changes: 12 additions & 0 deletions vllm/config.py
@@ -1037,6 +1037,18 @@ def _verify_args(self) -> None:
f"({self.num_scheduler_steps}) must be greater than or "
"equal to 1.")

if (not self.use_v2_block_manager \
and not envs.VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1):
raise ValueError(
"The use of BlockSpaceManagerV1 is deprecated and will "
"be removed in a future release. Please switch to "
"BlockSpaceManagerV2 by setting --use-v2-block-manager to "
"True. If you wish to suppress this error temporarily, "
"you can set the environment variable "
"`VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1`. If your use "
"case is not supported in BlockSpaceManagerV2, please "
"file an issue with detailed information.")

@property
def is_multi_step(self) -> bool:
return self.num_scheduler_steps > 1
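
When the variable is not set, the new check surfaces as a ValueError during engine-config validation. A hedged sketch of the failure path from the user side (the EngineArgs entry point and its create_engine_config() call are assumed here; they are not part of this hunk):

# Sketch only: triggers SchedulerConfig._verify_args() via the public EngineArgs API.
from vllm.engine.arg_utils import EngineArgs

args = EngineArgs(model="facebook/opt-125m", use_v2_block_manager=False)
try:
    args.create_engine_config()
except ValueError as err:
    # The message points the user at VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1.
    print(err)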
6 changes: 6 additions & 0 deletions vllm/envs.py
@@ -64,6 +64,7 @@
VLLM_USE_TRITON_AWQ: bool = False
VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
VLLM_SKIP_P2P_CHECK: bool = False
VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1: bool = False


def get_default_cache_root():
@@ -434,6 +435,11 @@ def get_default_config_root():
# and trust the driver's peer-to-peer capability report.
"VLLM_SKIP_P2P_CHECK":
lambda: os.getenv("VLLM_SKIP_P2P_CHECK", "0") == "1",

# If set, allow the use of the deprecated block manager V1
"VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1":
lambda: os.environ.get("VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1", "0"
) == "1",
}

# end-env-vars-definition
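
As a small check of the new entry's semantics: vllm.envs resolves each variable lazily via module-level __getattr__, so the flag is read at access time. A sketch, assuming the variable is not already set in the shell:

# Sketch: the flag defaults to False and flips only on an explicit "1".
import os
import vllm.envs as envs

assert envs.VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1 is False
os.environ["VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1"] = "1"
assert envs.VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1 is True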
