diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py
index 6d0495fdd4054..64c7b93f4a71b 100644
--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
@@ -28,7 +28,7 @@ def run_aria(question: str, modality: str):
               tokenizer_mode="slow",
               trust_remote_code=True,
               dtype="bfloat16",
-              mm_cache_preprocessor=args.mm_cache_preprocessor)
+              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
 
     prompt = (f"<|im_start|>user\n<|img|>\n{question}"
               "<|im_end|>\n<|im_start|>assistant\n")
@@ -45,7 +45,7 @@ def run_blip2(question: str, modality: str):
     # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
     prompt = f"Question: {question} Answer:"
     llm = LLM(model="Salesforce/blip2-opt-2.7b",
-              mm_cache_preprocessor=args.mm_cache_preprocessor)
+              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
     stop_token_ids = None
     return llm, prompt, stop_token_ids
 
@@ -57,7 +57,7 @@ def run_chameleon(question: str, modality: str):
     prompt = f"{question}"
     llm = LLM(model="facebook/chameleon-7b",
               max_model_len=4096,
-              mm_cache_preprocessor=args.mm_cache_preprocessor)
+              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
     stop_token_ids = None
     return llm, prompt, stop_token_ids
 
@@ -70,7 +70,7 @@ def run_fuyu(question: str, modality: str):
     llm = LLM(model="adept/fuyu-8b",
               max_model_len=2048,
               max_num_seqs=2,
-              mm_cache_preprocessor=args.mm_cache_preprocessor)
+              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
     stop_token_ids = None
     return llm, prompt, stop_token_ids
 
@@ -85,7 +85,7 @@ def run_glm4v(question: str, modality: str):
               max_num_seqs=2,
               trust_remote_code=True,
               enforce_eager=True,
-              mm_cache_preprocessor=args.mm_cache_preprocessor)
+              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
     prompt = question
     stop_token_ids = [151329, 151336, 151338]
     return llm, prompt, stop_token_ids
@@ -101,7 +101,7 @@ def run_h2ovl(question: str, modality: str):
         model=model_name,
         trust_remote_code=True,
         max_model_len=8192,
-        mm_cache_preprocessor=args.mm_cache_preprocessor,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )
 
     tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -134,7 +134,7 @@ def run_idefics3(question: str, modality: str):
                 "longest_edge": 3 * 364
             },
         },
-        mm_cache_preprocessor=args.mm_cache_preprocessor,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )
     prompt = (
         f"<|begin_of_text|>User:{question}\nAssistant:"
@@ -153,7 +153,7 @@ def run_internvl(question: str, modality: str):
         model=model_name,
         trust_remote_code=True,
         max_model_len=4096,
-        mm_cache_preprocessor=args.mm_cache_preprocessor,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )
 
     tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -180,7 +180,7 @@ def run_llava(question: str, modality: str):
 
     llm = LLM(model="llava-hf/llava-1.5-7b-hf",
               max_model_len=4096,
-              mm_cache_preprocessor=args.mm_cache_preprocessor)
+              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
     stop_token_ids = None
     return llm, prompt, stop_token_ids
 
@@ -192,7 +192,7 @@ def run_llava_next(question: str, modality: str):
     prompt = f"[INST] \n{question} [/INST]"
     llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf",
               max_model_len=8192,
-              mm_cache_preprocessor=args.mm_cache_preprocessor)
+              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
     stop_token_ids = None
     return llm, prompt, stop_token_ids
 
@@ -205,7 +205,7 @@ def run_llava_next_video(question: str, modality: str):
     prompt = f"USER: