diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 7bc43242b717e..6d0495fdd4054 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -19,6 +19,159 @@ # Unless specified, these settings have been tested to work on a single L4. +# Aria +def run_aria(question: str, modality: str): + assert modality == "image" + model_name = "rhymes-ai/Aria" + + llm = LLM(model=model_name, + tokenizer_mode="slow", + trust_remote_code=True, + dtype="bfloat16", + mm_cache_preprocessor=args.mm_cache_preprocessor) + + prompt = (f"<|im_start|>user\n<|img|>\n{question}" + "<|im_end|>\n<|im_start|>assistant\n") + + stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] + return llm, prompt, stop_token_ids + + +# BLIP-2 +def run_blip2(question: str, modality: str): + assert modality == "image" + + # BLIP-2 prompt format is inaccurate on HuggingFace model repository. + # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa + prompt = f"Question: {question} Answer:" + llm = LLM(model="Salesforce/blip2-opt-2.7b", + mm_cache_preprocessor=args.mm_cache_preprocessor) + stop_token_ids = None + return llm, prompt, stop_token_ids + + +# Chameleon +def run_chameleon(question: str, modality: str): + assert modality == "image" + + prompt = f"{question}" + llm = LLM(model="facebook/chameleon-7b", + max_model_len=4096, + mm_cache_preprocessor=args.mm_cache_preprocessor) + stop_token_ids = None + return llm, prompt, stop_token_ids + + +# Fuyu +def run_fuyu(question: str, modality: str): + assert modality == "image" + + prompt = f"{question}\n" + llm = LLM(model="adept/fuyu-8b", + max_model_len=2048, + max_num_seqs=2, + mm_cache_preprocessor=args.mm_cache_preprocessor) + stop_token_ids = None + return llm, prompt, stop_token_ids + + +# GLM-4v +def run_glm4v(question: str, modality: str): + assert modality == "image" + model_name = "THUDM/glm-4v-9b" + + llm = LLM(model=model_name, + max_model_len=2048, + max_num_seqs=2, + trust_remote_code=True, + enforce_eager=True, + mm_cache_preprocessor=args.mm_cache_preprocessor) + prompt = question + stop_token_ids = [151329, 151336, 151338] + return llm, prompt, stop_token_ids + + +# H2OVL-Mississippi +def run_h2ovl(question: str, modality: str): + assert modality == "image" + + model_name = "h2oai/h2ovl-mississippi-2b" + + llm = LLM( + model=model_name, + trust_remote_code=True, + max_model_len=8192, + mm_cache_preprocessor=args.mm_cache_preprocessor, + ) + + tokenizer = AutoTokenizer.from_pretrained(model_name, + trust_remote_code=True) + messages = [{'role': 'user', 'content': f"\n{question}"}] + prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + # Stop tokens for H2OVL-Mississippi + # https://huggingface.co/h2oai/h2ovl-mississippi-2b + stop_token_ids = [tokenizer.eos_token_id] + return llm, prompt, stop_token_ids + + +# Idefics3-8B-Llama3 +def run_idefics3(question: str, modality: str): + assert modality == "image" + model_name = "HuggingFaceM4/Idefics3-8B-Llama3" + + llm = LLM( + model=model_name, + max_model_len=8192, + max_num_seqs=2, + enforce_eager=True, + # if you are running out of memory, you can reduce the "longest_edge". + # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations + mm_processor_kwargs={ + "size": { + "longest_edge": 3 * 364 + }, + }, + mm_cache_preprocessor=args.mm_cache_preprocessor, + ) + prompt = ( + f"<|begin_of_text|>User:{question}\nAssistant:" + ) + stop_token_ids = None + return llm, prompt, stop_token_ids + + +# InternVL +def run_internvl(question: str, modality: str): + assert modality == "image" + + model_name = "OpenGVLab/InternVL2-2B" + + llm = LLM( + model=model_name, + trust_remote_code=True, + max_model_len=4096, + mm_cache_preprocessor=args.mm_cache_preprocessor, + ) + + tokenizer = AutoTokenizer.from_pretrained(model_name, + trust_remote_code=True) + messages = [{'role': 'user', 'content': f"\n{question}"}] + prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + # Stop tokens for InternVL + # models variants may have different stop tokens + # please refer to the model card for the correct "stop words": + # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py + stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"] + stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] + return llm, prompt, stop_token_ids + + # LLaVA-1.5 def run_llava(question: str, modality: str): assert modality == "image" @@ -75,83 +228,20 @@ def run_llava_onevision(question: str, modality: str): return llm, prompt, stop_token_ids -# Fuyu -def run_fuyu(question: str, modality: str): - assert modality == "image" - - prompt = f"{question}\n" - llm = LLM(model="adept/fuyu-8b", - max_model_len=2048, - max_num_seqs=2, - mm_cache_preprocessor=args.mm_cache_preprocessor) - stop_token_ids = None - return llm, prompt, stop_token_ids - - -# Phi-3-Vision -def run_phi3v(question: str, modality: str): +# Mantis +def run_mantis(question: str, modality: str): assert modality == "image" - prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n" + llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' # noqa: E501 + prompt = llama3_template.format(f"{question}\n") - # num_crops is an override kwarg to the multimodal image processor; - # For some models, e.g., Phi-3.5-vision-instruct, it is recommended - # to use 16 for single frame scenarios, and 4 for multi-frame. - # - # Generally speaking, a larger value for num_crops results in more - # tokens per image instance, because it may scale the image more in - # the image preprocessing. Some references in the model docs and the - # formula for image tokens after the preprocessing - # transform can be found below. - # - # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally - # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194 llm = LLM( - model="microsoft/Phi-3.5-vision-instruct", - trust_remote_code=True, + model="TIGER-Lab/Mantis-8B-siglip-llama3", max_model_len=4096, - max_num_seqs=2, - # Note - mm_processor_kwargs can also be passed to generate/chat calls - mm_processor_kwargs={"num_crops": 16}, + hf_overrides={"architectures": ["MantisForConditionalGeneration"]}, mm_cache_preprocessor=args.mm_cache_preprocessor, ) - stop_token_ids = None - return llm, prompt, stop_token_ids - - -# PaliGemma -def run_paligemma(question: str, modality: str): - assert modality == "image" - - # PaliGemma has special prompt format for VQA - prompt = "caption en" - llm = LLM(model="google/paligemma-3b-mix-224", - mm_cache_preprocessor=args.mm_cache_preprocessor) - stop_token_ids = None - return llm, prompt, stop_token_ids - - -# PaliGemma 2 -def run_paligemma2(question: str, modality: str): - assert modality == "image" - - # PaliGemma 2 has special prompt format for VQA - prompt = "caption en" - llm = LLM(model="google/paligemma2-3b-ft-docci-448", - mm_cache_preprocessor=args.mm_cache_preprocessor) - stop_token_ids = None - return llm, prompt, stop_token_ids - - -# Chameleon -def run_chameleon(question: str, modality: str): - assert modality == "image" - - prompt = f"{question}" - llm = LLM(model="facebook/chameleon-7b", - max_model_len=4096, - mm_cache_preprocessor=args.mm_cache_preprocessor) - stop_token_ids = None + stop_token_ids = [128009] return llm, prompt, stop_token_ids @@ -199,58 +289,45 @@ def run_minicpmv(question: str, modality: str): return llm, prompt, stop_token_ids -# H2OVL-Mississippi -def run_h2ovl(question: str, modality: str): +# LLama 3.2 +def run_mllama(question: str, modality: str): assert modality == "image" - model_name = "h2oai/h2ovl-mississippi-2b" + model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" + # Note: The default setting of max_num_seqs (256) and + # max_model_len (131072) for this model may cause OOM. + # You may lower either to run this example on lower-end GPUs. + + # The configuration below has been confirmed to launch on a single L40 GPU. llm = LLM( model=model_name, - trust_remote_code=True, - max_model_len=8192, + max_model_len=4096, + max_num_seqs=16, + enforce_eager=True, mm_cache_preprocessor=args.mm_cache_preprocessor, ) - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) - messages = [{'role': 'user', 'content': f"\n{question}"}] - prompt = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) - - # Stop tokens for H2OVL-Mississippi - # https://huggingface.co/h2oai/h2ovl-mississippi-2b - stop_token_ids = [tokenizer.eos_token_id] + prompt = f"<|image|><|begin_of_text|>{question}" + stop_token_ids = None return llm, prompt, stop_token_ids -# InternVL -def run_internvl(question: str, modality: str): +# Molmo +def run_molmo(question, modality): assert modality == "image" - model_name = "OpenGVLab/InternVL2-2B" + model_name = "allenai/Molmo-7B-D-0924" llm = LLM( model=model_name, trust_remote_code=True, - max_model_len=4096, + dtype="bfloat16", mm_cache_preprocessor=args.mm_cache_preprocessor, ) - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) - messages = [{'role': 'user', 'content': f"\n{question}"}] - prompt = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) - - # Stop tokens for InternVL - # models variants may have different stop tokens - # please refer to the model card for the correct "stop words": - # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py - stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"] - stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] + prompt = question + stop_token_ids = None return llm, prompt, stop_token_ids @@ -279,58 +356,57 @@ def run_nvlm_d(question: str, modality: str): return llm, prompt, stop_token_ids -# BLIP-2 -def run_blip2(question: str, modality: str): +# PaliGemma +def run_paligemma(question: str, modality: str): assert modality == "image" - # BLIP-2 prompt format is inaccurate on HuggingFace model repository. - # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa - prompt = f"Question: {question} Answer:" - llm = LLM(model="Salesforce/blip2-opt-2.7b", + # PaliGemma has special prompt format for VQA + prompt = "caption en" + llm = LLM(model="google/paligemma-3b-mix-224", mm_cache_preprocessor=args.mm_cache_preprocessor) stop_token_ids = None return llm, prompt, stop_token_ids -# Qwen -def run_qwen_vl(question: str, modality: str): +# PaliGemma 2 +def run_paligemma2(question: str, modality: str): assert modality == "image" - llm = LLM( - model="Qwen/Qwen-VL", - trust_remote_code=True, - max_model_len=1024, - max_num_seqs=2, - mm_cache_preprocessor=args.mm_cache_preprocessor, - ) - - prompt = f"{question}Picture 1: \n" + # PaliGemma 2 has special prompt format for VQA + prompt = "caption en" + llm = LLM(model="google/paligemma2-3b-ft-docci-448", + mm_cache_preprocessor=args.mm_cache_preprocessor) stop_token_ids = None return llm, prompt, stop_token_ids -# Qwen2-VL -def run_qwen2_vl(question: str, modality: str): +# Phi-3-Vision +def run_phi3v(question: str, modality: str): assert modality == "image" - model_name = "Qwen/Qwen2-VL-7B-Instruct" + prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n" + # num_crops is an override kwarg to the multimodal image processor; + # For some models, e.g., Phi-3.5-vision-instruct, it is recommended + # to use 16 for single frame scenarios, and 4 for multi-frame. + # + # Generally speaking, a larger value for num_crops results in more + # tokens per image instance, because it may scale the image more in + # the image preprocessing. Some references in the model docs and the + # formula for image tokens after the preprocessing + # transform can be found below. + # + # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally + # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194 llm = LLM( - model=model_name, + model="microsoft/Phi-3.5-vision-instruct", + trust_remote_code=True, max_model_len=4096, - max_num_seqs=5, + max_num_seqs=2, # Note - mm_processor_kwargs can also be passed to generate/chat calls - mm_processor_kwargs={ - "min_pixels": 28 * 28, - "max_pixels": 1280 * 28 * 28, - }, + mm_processor_kwargs={"num_crops": 16}, mm_cache_preprocessor=args.mm_cache_preprocessor, ) - - prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" - "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>" - f"{question}<|im_end|>\n" - "<|im_start|>assistant\n") stop_token_ids = None return llm, prompt, stop_token_ids @@ -352,149 +428,73 @@ def run_pixtral_hf(question: str, modality: str): return llm, prompt, stop_token_ids -# LLama 3.2 -def run_mllama(question: str, modality: str): - assert modality == "image" - - model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" - - # Note: The default setting of max_num_seqs (256) and - # max_model_len (131072) for this model may cause OOM. - # You may lower either to run this example on lower-end GPUs. - - # The configuration below has been confirmed to launch on a single L40 GPU. - llm = LLM( - model=model_name, - max_model_len=4096, - max_num_seqs=16, - enforce_eager=True, - mm_cache_preprocessor=args.mm_cache_preprocessor, - ) - - prompt = f"<|image|><|begin_of_text|>{question}" - stop_token_ids = None - return llm, prompt, stop_token_ids - - -# Molmo -def run_molmo(question, modality): +# Qwen +def run_qwen_vl(question: str, modality: str): assert modality == "image" - model_name = "allenai/Molmo-7B-D-0924" - llm = LLM( - model=model_name, + model="Qwen/Qwen-VL", trust_remote_code=True, - dtype="bfloat16", + max_model_len=1024, + max_num_seqs=2, mm_cache_preprocessor=args.mm_cache_preprocessor, ) - prompt = question + prompt = f"{question}Picture 1: \n" stop_token_ids = None return llm, prompt, stop_token_ids -# GLM-4v -def run_glm4v(question: str, modality: str): +# Qwen2-VL +def run_qwen2_vl(question: str, modality: str): assert modality == "image" - model_name = "THUDM/glm-4v-9b" - llm = LLM(model=model_name, - max_model_len=2048, - max_num_seqs=2, - trust_remote_code=True, - enforce_eager=True, - mm_cache_preprocessor=args.mm_cache_preprocessor) - prompt = question - stop_token_ids = [151329, 151336, 151338] - return llm, prompt, stop_token_ids - - -# Idefics3-8B-Llama3 -def run_idefics3(question: str, modality: str): - assert modality == "image" - model_name = "HuggingFaceM4/Idefics3-8B-Llama3" + model_name = "Qwen/Qwen2-VL-7B-Instruct" llm = LLM( model=model_name, - max_model_len=8192, - max_num_seqs=2, - enforce_eager=True, - # if you are running out of memory, you can reduce the "longest_edge". - # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations + max_model_len=4096, + max_num_seqs=5, + # Note - mm_processor_kwargs can also be passed to generate/chat calls mm_processor_kwargs={ - "size": { - "longest_edge": 3 * 364 - }, + "min_pixels": 28 * 28, + "max_pixels": 1280 * 28 * 28, }, mm_cache_preprocessor=args.mm_cache_preprocessor, ) - prompt = ( - f"<|begin_of_text|>User:{question}\nAssistant:" - ) - stop_token_ids = None - return llm, prompt, stop_token_ids - -# Aria -def run_aria(question: str, modality: str): - assert modality == "image" - model_name = "rhymes-ai/Aria" - - llm = LLM(model=model_name, - tokenizer_mode="slow", - trust_remote_code=True, - dtype="bfloat16", - mm_cache_preprocessor=args.mm_cache_preprocessor) - - prompt = (f"<|im_start|>user\n<|img|>\n{question}" - "<|im_end|>\n<|im_start|>assistant\n") - - stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] - return llm, prompt, stop_token_ids - - -# Mantis -def run_mantis(question: str, modality: str): - assert modality == "image" - - llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' # noqa: E501 - prompt = llama3_template.format(f"{question}\n") - - llm = LLM( - model="TIGER-Lab/Mantis-8B-siglip-llama3", - max_model_len=4096, - hf_overrides={"architectures": ["MantisForConditionalGeneration"]}, - mm_cache_preprocessor=args.mm_cache_preprocessor, - ) - stop_token_ids = [128009] + prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>" + f"{question}<|im_end|>\n" + "<|im_start|>assistant\n") + stop_token_ids = None return llm, prompt, stop_token_ids model_example_map = { + "aria": run_aria, + "blip-2": run_blip2, + "chameleon": run_chameleon, + "fuyu": run_fuyu, + "glm4v": run_glm4v, + "h2ovl_chat": run_h2ovl, + "idefics3": run_idefics3, + "internvl_chat": run_internvl, "llava": run_llava, "llava-next": run_llava_next, "llava-next-video": run_llava_next_video, "llava-onevision": run_llava_onevision, - "fuyu": run_fuyu, - "phi3_v": run_phi3v, - "paligemma": run_paligemma, - "paligemma2": run_paligemma2, - "chameleon": run_chameleon, + "mantis": run_mantis, "minicpmv": run_minicpmv, - "blip-2": run_blip2, - "h2ovl_chat": run_h2ovl, - "internvl_chat": run_internvl, + "mllama": run_mllama, + "molmo": run_molmo, "NVLM_D": run_nvlm_d, + "paligemma": run_paligemma, + "paligemma2": run_paligemma2, + "phi3_v": run_phi3v, + "pixtral_hf": run_pixtral_hf, "qwen_vl": run_qwen_vl, "qwen2_vl": run_qwen2_vl, - "pixtral_hf": run_pixtral_hf, - "mllama": run_mllama, - "molmo": run_molmo, - "glm4v": run_glm4v, - "idefics3": run_idefics3, - "aria": run_aria, - "mantis": run_mantis, } diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py index 928bbef54eab7..6af8d7768e75d 100644 --- a/examples/offline_inference_vision_language_multi_image.py +++ b/examples/offline_inference_vision_language_multi_image.py @@ -33,78 +33,23 @@ class ModelRequestData(NamedTuple): # Unless specified, these settings have been tested to work on a single L4. -def load_qwenvl_chat(question: str, image_urls: List[str]) -> ModelRequestData: - model_name = "Qwen/Qwen-VL-Chat" - llm = LLM( - model=model_name, - trust_remote_code=True, - max_model_len=1024, - max_num_seqs=2, - limit_mm_per_prompt={"image": len(image_urls)}, - ) - placeholders = "".join(f"Picture {i}: \n" - for i, _ in enumerate(image_urls, start=1)) - - # This model does not have a chat_template attribute on its tokenizer, - # so we need to explicitly pass it. We use ChatML since it's used in the - # generation utils of the model: - # https://huggingface.co/Qwen/Qwen-VL-Chat/blob/main/qwen_generation_utils.py#L265 - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) - - # Copied from: https://huggingface.co/docs/transformers/main/en/chat_templating - chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" # noqa: E501 - - messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}] - prompt = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True, - chat_template=chat_template) - - stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"] - stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] - return ModelRequestData( - llm=llm, - prompt=prompt, - stop_token_ids=stop_token_ids, - image_data=[fetch_image(url) for url in image_urls], - chat_template=chat_template, - ) - - -def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData: - # num_crops is an override kwarg to the multimodal image processor; - # For some models, e.g., Phi-3.5-vision-instruct, it is recommended - # to use 16 for single frame scenarios, and 4 for multi-frame. - # - # Generally speaking, a larger value for num_crops results in more - # tokens per image instance, because it may scale the image more in - # the image preprocessing. Some references in the model docs and the - # formula for image tokens after the preprocessing - # transform can be found below. - # - # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally - # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194 - llm = LLM( - model="microsoft/Phi-3.5-vision-instruct", - trust_remote_code=True, - max_model_len=4096, - max_num_seqs=2, - limit_mm_per_prompt={"image": len(image_urls)}, - mm_processor_kwargs={"num_crops": 4}, - ) - placeholders = "\n".join(f"<|image_{i}|>" - for i, _ in enumerate(image_urls, start=1)) - prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n" - stop_token_ids = None - +def load_aria(question, image_urls: List[str]) -> ModelRequestData: + model_name = "rhymes-ai/Aria" + llm = LLM(model=model_name, + tokenizer_mode="slow", + trust_remote_code=True, + dtype="bfloat16", + limit_mm_per_prompt={"image": len(image_urls)}) + placeholders = "<|img|>\n" * len(image_urls) + prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n" + "<|im_start|>assistant\n") + stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] return ModelRequestData( llm=llm, prompt=prompt, stop_token_ids=stop_token_ids, image_data=[fetch_image(url) for url in image_urls], - chat_template=None, - ) + chat_template=None) def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData: @@ -141,6 +86,37 @@ def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData: ) +def load_idefics3(question, image_urls: List[str]) -> ModelRequestData: + model_name = "HuggingFaceM4/Idefics3-8B-Llama3" + + # The configuration below has been confirmed to launch on a single L40 GPU. + llm = LLM( + model=model_name, + max_model_len=8192, + max_num_seqs=16, + enforce_eager=True, + limit_mm_per_prompt={"image": len(image_urls)}, + # if you are running out of memory, you can reduce the "longest_edge". + # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations + mm_processor_kwargs={ + "size": { + "longest_edge": 2 * 364 + }, + }, + ) + + placeholders = "\n".join(f"Image-{i}: \n" + for i, _ in enumerate(image_urls, start=1)) + prompt = f"<|begin_of_text|>User:{placeholders}\n{question}\nAssistant:" # noqa: E501 + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=None, + image_data=[fetch_image(url) for url in image_urls], + chat_template=None, + ) + + def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData: model_name = "OpenGVLab/InternVL2-2B" @@ -178,6 +154,28 @@ def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData: ) +def load_mllama(question, image_urls: List[str]) -> ModelRequestData: + model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" + + # The configuration below has been confirmed to launch on a single L40 GPU. + llm = LLM( + model=model_name, + max_model_len=4096, + max_num_seqs=16, + enforce_eager=True, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + prompt = f"<|image|><|image|><|begin_of_text|>{question}" + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=None, + image_data=[fetch_image(url) for url in image_urls], + chat_template=None, + ) + + def load_nvlm_d(question: str, image_urls: List[str]): model_name = "nvidia/NVLM-D-72B" @@ -211,6 +209,80 @@ def load_nvlm_d(question: str, image_urls: List[str]): ) +def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData: + # num_crops is an override kwarg to the multimodal image processor; + # For some models, e.g., Phi-3.5-vision-instruct, it is recommended + # to use 16 for single frame scenarios, and 4 for multi-frame. + # + # Generally speaking, a larger value for num_crops results in more + # tokens per image instance, because it may scale the image more in + # the image preprocessing. Some references in the model docs and the + # formula for image tokens after the preprocessing + # transform can be found below. + # + # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally + # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194 + llm = LLM( + model="microsoft/Phi-3.5-vision-instruct", + trust_remote_code=True, + max_model_len=4096, + max_num_seqs=2, + limit_mm_per_prompt={"image": len(image_urls)}, + mm_processor_kwargs={"num_crops": 4}, + ) + placeholders = "\n".join(f"<|image_{i}|>" + for i, _ in enumerate(image_urls, start=1)) + prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n" + stop_token_ids = None + + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=stop_token_ids, + image_data=[fetch_image(url) for url in image_urls], + chat_template=None, + ) + + +def load_qwenvl_chat(question: str, image_urls: List[str]) -> ModelRequestData: + model_name = "Qwen/Qwen-VL-Chat" + llm = LLM( + model=model_name, + trust_remote_code=True, + max_model_len=1024, + max_num_seqs=2, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + placeholders = "".join(f"Picture {i}: \n" + for i, _ in enumerate(image_urls, start=1)) + + # This model does not have a chat_template attribute on its tokenizer, + # so we need to explicitly pass it. We use ChatML since it's used in the + # generation utils of the model: + # https://huggingface.co/Qwen/Qwen-VL-Chat/blob/main/qwen_generation_utils.py#L265 + tokenizer = AutoTokenizer.from_pretrained(model_name, + trust_remote_code=True) + + # Copied from: https://huggingface.co/docs/transformers/main/en/chat_templating + chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" # noqa: E501 + + messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}] + prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True, + chat_template=chat_template) + + stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"] + stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=stop_token_ids, + image_data=[fetch_image(url) for url in image_urls], + chat_template=chat_template, + ) + + def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData: try: from qwen_vl_utils import process_vision_info @@ -268,88 +340,16 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData: ) -def load_mllama(question, image_urls: List[str]) -> ModelRequestData: - model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" - - # The configuration below has been confirmed to launch on a single L40 GPU. - llm = LLM( - model=model_name, - max_model_len=4096, - max_num_seqs=16, - enforce_eager=True, - limit_mm_per_prompt={"image": len(image_urls)}, - ) - - prompt = f"<|image|><|image|><|begin_of_text|>{question}" - return ModelRequestData( - llm=llm, - prompt=prompt, - stop_token_ids=None, - image_data=[fetch_image(url) for url in image_urls], - chat_template=None, - ) - - -def load_idefics3(question, image_urls: List[str]) -> ModelRequestData: - model_name = "HuggingFaceM4/Idefics3-8B-Llama3" - - # The configuration below has been confirmed to launch on a single L40 GPU. - llm = LLM( - model=model_name, - max_model_len=8192, - max_num_seqs=16, - enforce_eager=True, - limit_mm_per_prompt={"image": len(image_urls)}, - # if you are running out of memory, you can reduce the "longest_edge". - # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations - mm_processor_kwargs={ - "size": { - "longest_edge": 2 * 364 - }, - }, - ) - - placeholders = "\n".join(f"Image-{i}: \n" - for i, _ in enumerate(image_urls, start=1)) - prompt = f"<|begin_of_text|>User:{placeholders}\n{question}\nAssistant:" # noqa: E501 - return ModelRequestData( - llm=llm, - prompt=prompt, - stop_token_ids=None, - image_data=[fetch_image(url) for url in image_urls], - chat_template=None, - ) - - -def load_aria(question, image_urls: List[str]) -> ModelRequestData: - model_name = "rhymes-ai/Aria" - llm = LLM(model=model_name, - tokenizer_mode="slow", - trust_remote_code=True, - dtype="bfloat16", - limit_mm_per_prompt={"image": len(image_urls)}) - placeholders = "<|img|>\n" * len(image_urls) - prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n" - "<|im_start|>assistant\n") - stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] - return ModelRequestData( - llm=llm, - prompt=prompt, - stop_token_ids=stop_token_ids, - image_data=[fetch_image(url) for url in image_urls], - chat_template=None) - - model_example_map = { - "phi3_v": load_phi3v, + "aria": load_aria, "h2ovl_chat": load_h2onvl, + "idefics3": load_idefics3, "internvl_chat": load_internvl, + "mllama": load_mllama, "NVLM_D": load_nvlm_d, - "qwen2_vl": load_qwen2_vl, + "phi3_v": load_phi3v, "qwen_vl_chat": load_qwenvl_chat, - "mllama": load_mllama, - "idefics3": load_idefics3, - "aria": load_aria, + "qwen2_vl": load_qwen2_vl, }