diff --git a/autogen/agentchat/contrib/multimodal_conversable_agent.py b/autogen/agentchat/contrib/multimodal_conversable_agent.py index 31fe1b7d5aae..1c56529a24dc 100644 --- a/autogen/agentchat/contrib/multimodal_conversable_agent.py +++ b/autogen/agentchat/contrib/multimodal_conversable_agent.py @@ -15,10 +15,8 @@ ) from autogen.code_utils import content_str -from ..._pydantic import model_dump - DEFAULT_LMM_SYS_MSG = """You are a helpful AI assistant.""" -DEFAULT_MODEL = "gpt-4-vision-preview" +DEFAULT_MODEL = "gpt-4-turbo" class MultimodalConversableAgent(ConversableAgent): @@ -116,13 +114,5 @@ def generate_oai_reply( messages_with_b64_img = message_formatter_pil_to_b64(self._oai_system_message + messages) - # TODO: #1143 handle token limit exceeded error - response = client.create( - context=messages[-1].pop("context", None), messages=messages_with_b64_img, agent=self.name - ) - - # TODO: line 301, line 271 is converting messages to dict. Can be removed after ChatCompletionMessage_to_dict is merged. - extracted_response = client.extract_text_or_completion_object(response)[0] - if not isinstance(extracted_response, str): - extracted_response = model_dump(extracted_response) - return True, extracted_response + extracted_response = self._generate_oai_reply_from_client(client, messages_with_b64_img, self.client_cache) + return (False, None) if extracted_response is None else (True, extracted_response) diff --git a/test/agentchat/contrib/test_lmm.py b/test/agentchat/contrib/test_lmm.py index f174855bfbeb..862119818ec3 100755 --- a/test/agentchat/contrib/test_lmm.py +++ b/test/agentchat/contrib/test_lmm.py @@ -10,10 +10,11 @@ from unittest.mock import MagicMock import pytest +from annotated_types import Annotated from conftest import MOCK_OPEN_AI_API_KEY import autogen -from autogen.agentchat.conversable_agent import ConversableAgent +from autogen import AssistantAgent, ConversableAgent, UserProxyAgent try: from autogen.agentchat.contrib.img_utils import get_pil_image @@ -23,6 +24,7 @@ else: skip = False +VISION_MODEL_NAME = "gpt-4-turbo" base64_encoded_image = ( "" @@ -44,7 +46,7 @@ def setUp(self): llm_config={ "timeout": 600, "seed": 42, - "config_list": [{"model": "gpt-4-vision-preview", "api_key": MOCK_OPEN_AI_API_KEY}], + "config_list": [{"model": VISION_MODEL_NAME, "api_key": MOCK_OPEN_AI_API_KEY}], }, ) @@ -144,5 +146,45 @@ def test_group_chat_with_lmm(): assert all(len(arr) <= max_round for arr in user_proxy._oai_messages.values()), "User proxy exceeded max rounds" +@pytest.mark.skipif(skip, reason="Dependency not installed") +def test_func_call_with_lmm(): + assistant = MultimodalConversableAgent( + name="Assistant", + system_message="Describe all the colors in the image.", + human_input_mode="NEVER", + max_consecutive_auto_reply=2, + llm_config={ + "timeout": 600, + "seed": 42, + "config_list": [{"model": VISION_MODEL_NAME, "api_key": MOCK_OPEN_AI_API_KEY}], + }, + ) + + coder = AssistantAgent( + name="Coder", + system_message="YOU MUST USE THE FUNCTION PROVIDED.", + llm_config={ + "timeout": 600, + "seed": 42, + "config_list": [{"model": VISION_MODEL_NAME, "api_key": MOCK_OPEN_AI_API_KEY}], + }, + human_input_mode="NEVER", + code_execution_config=False, + max_consecutive_auto_reply=2, + ) + + def count_colors(colors: list) -> int: + return len(colors) + + coder.register_for_llm(name="count_colors", description="Count colors.")(count_colors) + assistant.register_for_execution(name="count_colors")(count_colors) + + coder.initiate_chat( + assistant, clear_history=True, message=f"""How many colors here: """ + ) + + assert len(coder._oai_messages[assistant]) > 1, "Function call did not happen" + + if __name__ == "__main__": unittest.main()