
Commit: sync
alexm-neuralmagic committed Dec 3, 2024
Parent: db28436; commit: 4a5aecb
Showing 2 changed files with 45 additions and 7 deletions.
examples/offline_inference_vision_language.py (41 additions, 7 deletions)
@@ -5,6 +5,7 @@
 For most models, the prompt format should follow corresponding examples
 on HuggingFace model repository.
 """
+import random
 from transformers import AutoTokenizer
 
 from vllm import LLM, SamplingParams
@@ -23,7 +24,9 @@ def run_llava(question: str, modality: str):
 
     prompt = f"USER: <image>\n{question}\nASSISTANT:"
 
-    llm = LLM(model="llava-hf/llava-1.5-7b-hf", max_model_len=4096)
+    llm = LLM(model="llava-hf/llava-1.5-7b-hf",
+              max_model_len=4096,
+              mm_cache_preprocessor=args.mm_cache_preprocessor)
     stop_token_ids = None
     return llm, prompt, stop_token_ids

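Note: run_llava reads the module-level args, so the new keyword only takes
effect when the script is run directly. Conceptually, the flag asks the
engine to memoize preprocessor output keyed on the image content; a minimal
sketch of such a cache (an illustration with assumed names, not vLLM's
actual implementation):

    import hashlib

    class CachingPreprocessor:
        """Memoize a preprocessor, keyed on the raw image bytes."""

        def __init__(self, preprocess):
            self._preprocess = preprocess
            self._cache = {}

        def __call__(self, image):
            key = hashlib.sha256(image.tobytes()).hexdigest()
            if key not in self._cache:
                # Cache miss: run the real (expensive) preprocessor once.
                self._cache[key] = self._preprocess(image)
            return self._cache[key]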
@@ -507,12 +510,29 @@ def main(args):
 
     else:
         # Batch inference
-        inputs = [{
-            "prompt": prompt,
-            "multi_modal_data": {
-                modality: data
-            },
-        } for _ in range(args.num_prompts)]
+        if args.image_repeat_ratio is not None:
+            assert (args.image_repeat_ratio <= 1.0
+                    and args.image_repeat_ratio >= 0)
+            no_yes = [0, 1]
+            probs = [1.0 - args.image_repeat_ratio, args.image_repeat_ratio]
+
+        inputs = []
+        cur_image = data
+        for i in range(args.num_prompts):
+            if args.image_repeat_ratio is not None:
+                res = random.choices(no_yes, probs)[0]
+                if res == 0:
+                    # No repeat => modify one pixel to make the image unique
+                    cur_image = cur_image.copy()
+                    new_val = (i // 256 // 256, i // 256, i % 256)
+                    cur_image.putpixel((0, 0), new_val)
+
+            inputs.append({
+                "prompt": prompt,
+                "multi_modal_data": {
+                    modality: cur_image
+                }
+            })
 
     import time
     start_time = time.time()
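The loop above draws a Bernoulli sample per prompt: with probability
image_repeat_ratio it reuses the previous image (a cache hit), otherwise it
copies the image and rewrites pixel (0, 0) with a value derived from i, so
each "miss" image is byte-wise unique while i < 65536. A standalone sketch
of the same idea, assuming a Pillow RGB image (ratio stands in for
args.image_repeat_ratio):

    import random
    from PIL import Image

    def repeat_or_perturb(base: Image.Image, num_prompts: int, ratio: float):
        """Yield images; ~ratio of draws repeat the previous image, the
        rest are made unique by overwriting a single pixel."""
        cur = base
        for i in range(num_prompts):
            if random.random() >= ratio:
                cur = cur.copy()
                # Encode i into an RGB triple, unique while i < 65536.
                cur.putpixel((0, 0), (i // 256 // 256, i // 256, i % 256))
            yield cur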
@@ -548,5 +568,19 @@
         type=int,
         default=16,
         help='Number of frames to extract from the video.')
+
+    parser.add_argument(
+        '--image-repeat-ratio',
+        type=float,
+        default=None,
+        help='Simulates the hit-ratio for multi-modal '
+        'preprocessor cache (if enabled)',
+    )
+
+    parser.add_argument(
+        '--mm-cache-preprocessor',
+        action='store_true',
+        help='If True, enable caching of multi-modal preprocessor/mapper.')
+
     args = parser.parse_args()
     main(args)
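A typical invocation exercising both new flags, e.g.
python examples/offline_inference_vision_language.py --model-type llava
--num-prompts 100 --mm-cache-preprocessor --image-repeat-ratio 0.5
(--num-prompts is read by the code above; the model-selection flag name is
an assumption about the surrounding script). With a ratio of 0.5, roughly
half the prompts reuse the previous image, so the preprocessor cache should
see about a 50% hit rate.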
vllm/engine/arg_utils.py (4 additions, 0 deletions)
@@ -593,6 +593,10 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             type=json.loads,
             help=('Overrides for the multimodal input mapping/processing, '
                   'e.g., image processor. For example: {"num_crops": 4}.'))
+        parser.add_argument(
+            '--mm-cache-preprocessor',
+            action='store_true',
+            help='If True, enable caching of multi-modal preprocessor/mapper.')
 
         # LoRA related configs
         parser.add_argument('--enable-lora',
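A hedged sketch of how the flag surfaces programmatically, assuming the
matching EngineArgs dataclass field exists (that part of the change is not
shown in this diff):

    from vllm.engine.arg_utils import EngineArgs

    # store_true flags default to False, matching the dataclass default.
    engine_args = EngineArgs(model="llava-hf/llava-1.5-7b-hf",
                             mm_cache_preprocessor=True)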
