
Commit 1c6445d
Merge pull request #171 from ag2ai/openaitestmodel
Update of OpenAI (and Azure) models and versions
qingyun-wu authored Dec 15, 2024
2 parents 5e7758b + 07759f3 commit 1c6445d
Showing 28 changed files with 114 additions and 134 deletions.
5 changes: 2 additions & 3 deletions test/agentchat/contrib/agent_eval/test_agent_eval.py
@@ -38,13 +38,12 @@ def remove_ground_truth(test_case: str):
     filter_dict={
         "api_type": ["openai"],
         "model": [
+            "gpt-4o-mini",
+            "gpt-4o",
             "gpt-4-turbo",
             "gpt-4-turbo-preview",
             "gpt-4-0125-preview",
             "gpt-4-1106-preview",
-            "gpt-3.5-turbo",
-            "gpt-3.5-turbo-0125",
-            "gpt-3.5-turbo-1106",
         ],
     },
 )
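
The filter above controls which entries from OAI_CONFIG_LIST the test may use, so retiring the gpt-3.5 names simply means those configs stop matching. The standalone sketch below illustrates only that matching rule; the helper name and the sample configs are invented for illustration, and this is not the library's implementation.

# A minimal sketch of the rule a filter_dict applies, assuming a list-valued
# filter means "accept any of these values". Hypothetical helper, not autogen's.
from typing import Any, Dict, List


def filter_config_list(configs: List[Dict[str, Any]], filter_dict: Dict[str, List[str]]) -> List[Dict[str, Any]]:
    """Keep only the configs whose values satisfy every key in filter_dict."""

    def matches(cfg: Dict[str, Any]) -> bool:
        for key, accepted in filter_dict.items():
            value = cfg.get(key)
            values = value if isinstance(value, list) else [value]  # e.g. "tags" holds a list
            if not any(v in accepted for v in values):
                return False
        return True

    return [cfg for cfg in configs if matches(cfg)]


# Only the first entry survives the filter used in the test above.
sample_configs = [
    {"api_type": "openai", "model": "gpt-4o", "api_key": "sk-placeholder"},
    {"api_type": "openai", "model": "gpt-3.5-turbo", "api_key": "sk-placeholder"},
]
print(filter_config_list(sample_configs, {"api_type": ["openai"], "model": ["gpt-4o-mini", "gpt-4o"]}))
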
@@ -17,12 +17,8 @@
 from test_assistant_agent import KEY_LOC, OAI_CONFIG_LIST  # noqa: E402

 # Specify the model to use. GPT-3.5 is less reliable than GPT-4 at learning from user input.
-filter_dict = {"model": ["gpt-4-0125-preview"]}
-# filter_dict = {"model": ["gpt-3.5-turbo-1106"]}
-# filter_dict = {"model": ["gpt-4-0613"]}
-# filter_dict = {"model": ["gpt-3.5-turbo"]}
-# filter_dict = {"model": ["gpt-4"]}
-# filter_dict = {"model": ["gpt-35-turbo-16k", "gpt-3.5-turbo-16k"]}
+filter_dict = {"model": ["gpt-4o-mini"]}
+# filter_dict = {"model": ["gpt-4-0125-preview"]}


 def create_teachable_agent(reset_db=False):
@@ -32,7 +32,7 @@
 sys.path.append(os.path.join(os.path.dirname(__file__), "../.."))
 from conftest import MOCK_OPEN_AI_API_KEY, skip_openai  # noqa: E402

-filter_dict = {"model": ["gpt-35-turbo-16k", "gpt-3.5-turbo-16k"]}
+filter_dict = {"model": ["gpt-4o-mini"]}

 RESOLUTIONS = ["256x256", "512x512", "1024x1024"]
 QUALITIES = ["standard", "hd"]
@@ -67,21 +67,21 @@ def api_key():

 @pytest.fixture
 def dalle_config() -> Dict[str, Any]:
-    config_list = openai_utils.config_list_from_models(model_list=["dall-e-2"], exclude="aoai")
+    config_list = openai_utils.config_list_from_models(model_list=["dall-e-3"], exclude="aoai")
     if not config_list:
-        config_list = [{"model": "dall-e-2", "api_key": api_key()}]
+        config_list = [{"model": "dall-e-3", "api_key": api_key()}]
     return {"config_list": config_list, "timeout": 120, "cache_seed": None}


 @pytest.fixture
-def gpt3_config() -> Dict[str, Any]:
+def gpt4_config() -> Dict[str, Any]:
     config_list = [
         {
-            "model": "gpt-35-turbo-16k",
+            "model": "gpt-4o-mini",
             "api_key": api_key(),
         },
         {
-            "model": "gpt-3.5-turbo-16k",
+            "model": "gpt-4o",
             "api_key": api_key(),
         },
     ]
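
The DALL-E fixture keeps its lookup-then-fallback shape: ask openai_utils for matching entries first, then hand-build one if nothing is found. Below is a compressed sketch of that pattern; reading the key from the OPENAI_API_KEY environment variable is an assumption standing in for the test's api_key() helper.

# Sketch of the "library lookup first, manual fallback second" fixture pattern.
import os

from autogen.oai.openai_utils import config_list_from_models


def dalle_config() -> dict:
    config_list = config_list_from_models(model_list=["dall-e-3"], exclude="aoai")
    if not config_list:
        # No matching entry found on disk: fall back to a single hand-written config.
        config_list = [{"model": "dall-e-3", "api_key": os.environ.get("OPENAI_API_KEY", "")}]
    return {"config_list": config_list, "timeout": 120, "cache_seed": None}
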
4 changes: 1 addition & 3 deletions test/agentchat/contrib/capabilities/test_teachable_agent.py
@@ -31,10 +31,8 @@
 # Specify the model to use by uncommenting one of the following lines.
 # filter_dict={"model": ["gpt-4-1106-preview"]}
 # filter_dict={"model": ["gpt-4-0613"]}
-# filter_dict={"model": ["gpt-3.5-turbo-1106"]}
-# filter_dict={"model": ["gpt-3.5-turbo-0613"]}
 # filter_dict={"model": ["gpt-4"]}
-filter_dict = {"tags": ["gpt-35-turbo-16k", "gpt-3.5-turbo-16k"]}
+filter_dict = {"tags": ["gpt-4o-mini"]}


 def create_teachable_agent(reset_db=False, verbosity=0):
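
Filtering on "tags" only works if the entries in OAI_CONFIG_LIST actually carry a tags field. The excerpt below is hypothetical file content that the new filter would match; the keys and tags are placeholders, not taken from the repository's real config.

# Hypothetical OAI_CONFIG_LIST contents matched by filter_dict = {"tags": ["gpt-4o-mini"]}.
import json

example_oai_config_list = [
    {"model": "gpt-4o-mini", "api_key": "sk-placeholder", "tags": ["gpt-4o-mini"]},
    {"model": "gpt-4o", "api_key": "sk-placeholder", "tags": ["gpt-4o"]},
]

# Written to disk as OAI_CONFIG_LIST, only the first entry carries the requested tag.
print(json.dumps(example_oai_config_list, indent=2))
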
@@ -33,7 +33,7 @@ def test_transform_messages_capability():
         OAI_CONFIG_LIST,
         KEY_LOC,
         filter_dict={
-            "model": "gpt-3.5-turbo",
+            "model": "gpt-4o-mini",
         },
     )

20 changes: 10 additions & 10 deletions test/agentchat/contrib/test_agent_builder.py
@@ -51,8 +51,8 @@ def test_build():
     builder = AgentBuilder(
         config_file_or_env=OAI_CONFIG_LIST,
         config_file_location=KEY_LOC,
-        builder_model_tags=["gpt-4"],
-        agent_model_tags=["gpt-4"],
+        builder_model_tags=["gpt-4o"],
+        agent_model_tags=["gpt-4o"],
     )
     building_task = (
         "Find a paper on arxiv by programming, and analyze its application in some domain. "
@@ -83,8 +83,8 @@ def test_build_from_library():
     builder = AgentBuilder(
         config_file_or_env=OAI_CONFIG_LIST,
         config_file_location=KEY_LOC,
-        builder_model_tags=["gpt-4"],
-        agent_model_tags=["gpt-4"],
+        builder_model_tags=["gpt-4o"],
+        agent_model_tags=["gpt-4o"],
     )
     building_task = (
         "Find a paper on arxiv by programming, and analyze its application in some domain. "
@@ -136,8 +136,8 @@ def test_save():
     builder = AgentBuilder(
         config_file_or_env=OAI_CONFIG_LIST,
         config_file_location=KEY_LOC,
-        builder_model_tags=["gpt-4"],
-        agent_model_tags=["gpt-4"],
+        builder_model_tags=["gpt-4o"],
+        agent_model_tags=["gpt-4o"],
     )
     building_task = (
         "Find a paper on arxiv by programming, and analyze its application in some domain. "
@@ -175,8 +175,8 @@ def test_load():
         config_file_location=KEY_LOC,
         # builder_model=["gpt-4", "gpt-4-1106-preview"],
         # agent_model=["gpt-4", "gpt-4-1106-preview"],
-        builder_model_tags=["gpt-4"],
-        agent_model_tags=["gpt-4"],
+        builder_model_tags=["gpt-4o"],
+        agent_model_tags=["gpt-4o"],
     )

     config_save_path = f"{here}/example_test_agent_builder_config.json"
@@ -204,8 +204,8 @@ def test_clear_agent():
     builder = AgentBuilder(
         config_file_or_env=OAI_CONFIG_LIST,
         config_file_location=KEY_LOC,
-        builder_model_tags=["gpt-4"],
-        agent_model_tags=["gpt-4"],
+        builder_model_tags=["gpt-4o"],
+        agent_model_tags=["gpt-4o"],
     )

     config_save_path = f"{here}/example_test_agent_builder_config.json"
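
All of these tests now select the builder and agent models through tags instead of hard-coded names. The sketch below mirrors the constructor arguments visible in the hunks; it assumes an OAI_CONFIG_LIST file in the working directory whose gpt-4o entries are tagged "gpt-4o", and the build() call is written from memory of the builder's interface, so treat its parameter names and return values as assumptions.

# Sketch of the builder pattern exercised above, under the assumptions stated in
# the lead-in: builder_model_tags selects the model that plans the agent team,
# agent_model_tags selects the model handed to the generated agents.
from autogen.agentchat.contrib.agent_builder import AgentBuilder

builder = AgentBuilder(
    config_file_or_env="OAI_CONFIG_LIST",
    config_file_location=".",
    builder_model_tags=["gpt-4o"],
    agent_model_tags=["gpt-4o"],
)
agent_list, agent_configs = builder.build(
    building_task="Find a paper on arxiv by programming, and analyze its application in some domain.",
    default_llm_config={"temperature": 0},
)
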
3 changes: 0 additions & 3 deletions test/agentchat/contrib/test_gpt_assistant.py
@@ -40,9 +40,6 @@
             "gpt-4-turbo-preview",
             "gpt-4-0125-preview",
             "gpt-4-1106-preview",
-            "gpt-3.5-turbo",
-            "gpt-3.5-turbo-0125",
-            "gpt-3.5-turbo-1106",
         ],
     },
 )
@@ -48,7 +48,7 @@ def test_group_chat_with_llama_index_conversable_agent(chat_mock: MagicMock) ->
     Each agent is set to describe an image in a unique style, but the chat should not exceed the specified max_rounds.
     """
     llm = OpenAI(
-        model="gpt-4",
+        model="gpt-4o",
         temperature=0.0,
         api_key=openaiKey,
     )
7 changes: 5 additions & 2 deletions test/agentchat/contrib/test_reasoning_agent.py
@@ -43,7 +43,7 @@ def think_node():
 @pytest.fixture
 def reasoning_agent():
     """Create a ReasoningAgent instance for testing"""
-    config_list = [{"model": "gpt-4", "api_key": "fake_key"}]
+    config_list = [{"model": "gpt-4o", "api_key": "fake_key"}]
     llm_config = {"config_list": config_list, "temperature": 0}
     return ReasoningAgent("reasoning_agent", llm_config=llm_config)

@@ -164,7 +164,10 @@ def test_reasoning_agent_answer():

 def helper_test_reasoning_agent_answer(max_depth, beam_size, answer_approach):
     """Test that ReasoningAgent properly terminates when TERMINATE is received"""
-    mock_config = {"config_list": [{"model": "gpt-4", "api_key": "fake", "base_url": "0.0.0.0:8000"}], "temperature": 0}
+    mock_config = {
+        "config_list": [{"model": "gpt-4o", "api_key": "fake", "base_url": "0.0.0.0:8000"}],
+        "temperature": 0,
+    }
     with patch("autogen.agentchat.conversable_agent.ConversableAgent.generate_oai_reply") as mock_oai_reply:
         agent = ReasoningAgent(
             "test_agent",
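
The helper above never reaches a real endpoint: ConversableAgent.generate_oai_reply is patched, which is why a fake key and a local base_url are harmless. A condensed sketch of that offline pattern follows; the import path and the canned TERMINATE reply are assumptions based on this test module, not a documented recipe.

# Offline-test sketch: patch generate_oai_reply so no request ever leaves the process.
from unittest.mock import patch

from autogen.agentchat.contrib.reasoning_agent import ReasoningAgent

mock_config = {
    "config_list": [{"model": "gpt-4o", "api_key": "fake", "base_url": "0.0.0.0:8000"}],
    "temperature": 0,
}

with patch("autogen.agentchat.conversable_agent.ConversableAgent.generate_oai_reply") as mock_oai_reply:
    mock_oai_reply.return_value = (True, "TERMINATE")  # canned reply instead of a model call
    agent = ReasoningAgent("test_agent", llm_config=mock_config)
    # Anything the agent "generates" now comes from the mock, not the fake endpoint.
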
2 changes: 1 addition & 1 deletion test/agentchat/test_agent_logging.py
@@ -50,7 +50,7 @@
 config_list = autogen.config_list_from_json(
     OAI_CONFIG_LIST,
     filter_dict={
-        "tags": ["gpt-3.5-turbo"],
+        "tags": ["gpt-4o-mini"],
     },
     file_location=KEY_LOC,
 )
18 changes: 9 additions & 9 deletions test/agentchat/test_agent_usage.py
@@ -32,44 +32,44 @@ def test_gathering():
         system_message="You are a helpful assistant.",
         llm_config={
             "config_list": config_list,
-            "model": "gpt-3.5-turbo-0613",
+            "model": "gpt-4o-mini",
         },
     )
     assistant2 = AssistantAgent(
         "assistant",
         system_message="You are a helpful assistant.",
         llm_config={
             "config_list": config_list,
-            "model": "gpt-3.5-turbo-0613",
+            "model": "gpt-4o-mini",
         },
     )
     assistant3 = AssistantAgent(
         "assistant",
         system_message="You are a helpful assistant.",
         llm_config={
             "config_list": config_list,
-            "model": "gpt-3.5-turbo-0613",
+            "model": "gpt-4o",
         },
     )

     assistant1.client.total_usage_summary = {
         "total_cost": 0.1,
-        "gpt-35-turbo": {"cost": 0.1, "prompt_tokens": 100, "completion_tokens": 200, "total_tokens": 300},
+        "gpt-4o-mini": {"cost": 0.1, "prompt_tokens": 100, "completion_tokens": 200, "total_tokens": 300},
     }
     assistant2.client.total_usage_summary = {
         "total_cost": 0.2,
-        "gpt-35-turbo": {"cost": 0.2, "prompt_tokens": 100, "completion_tokens": 200, "total_tokens": 300},
+        "gpt-4o-mini": {"cost": 0.2, "prompt_tokens": 100, "completion_tokens": 200, "total_tokens": 300},
     }
     assistant3.client.total_usage_summary = {
         "total_cost": 0.3,
-        "gpt-4": {"cost": 0.3, "prompt_tokens": 100, "completion_tokens": 200, "total_tokens": 300},
+        "gpt-4o": {"cost": 0.3, "prompt_tokens": 100, "completion_tokens": 200, "total_tokens": 300},
     }

     total_usage = gather_usage_summary([assistant1, assistant2, assistant3])

     assert round(total_usage["usage_including_cached_inference"]["total_cost"], 8) == 0.6
-    assert round(total_usage["usage_including_cached_inference"]["gpt-35-turbo"]["cost"], 8) == 0.3
-    assert round(total_usage["usage_including_cached_inference"]["gpt-4"]["cost"], 8) == 0.3
+    assert round(total_usage["usage_including_cached_inference"]["gpt-4o-mini"]["cost"], 8) == 0.3
+    assert round(total_usage["usage_including_cached_inference"]["gpt-4o"]["cost"], 8) == 0.3

     # test when agent doesn't have client
     user_proxy = UserProxyAgent(
@@ -91,7 +91,7 @@ def test_agent_usage():
     config_list = autogen.config_list_from_json(
         OAI_CONFIG_LIST,
         file_location=KEY_LOC,
-        filter_dict={"tags": ["gpt-3.5-turbo"]},
+        filter_dict={"tags": ["gpt-4o-mini"]},
     )
     assistant = AssistantAgent(
         "assistant",
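
The updated asserts encode what gather_usage_summary is expected to do: sum total_cost across agents and merge per-model entries by model name. The plain-Python sketch below reproduces only that arithmetic, including the float rounding the asserts guard against; it is not the library's implementation.

# Arithmetic behind the asserts above: merge per-model costs by key, sum the totals.
from collections import defaultdict

summaries = [
    {"total_cost": 0.1, "gpt-4o-mini": {"cost": 0.1}},
    {"total_cost": 0.2, "gpt-4o-mini": {"cost": 0.2}},
    {"total_cost": 0.3, "gpt-4o": {"cost": 0.3}},
]

total_cost = sum(s["total_cost"] for s in summaries)
per_model_cost = defaultdict(float)
for summary in summaries:
    for model, usage in summary.items():
        if model != "total_cost":
            per_model_cost[model] += usage["cost"]

# round(..., 8) matters: 0.1 + 0.2 is 0.30000000000000004 in binary floating point.
assert round(total_cost, 8) == 0.6
assert round(per_model_cost["gpt-4o-mini"], 8) == 0.3
assert round(per_model_cost["gpt-4o"], 8) == 0.3
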
8 changes: 4 additions & 4 deletions test/agentchat/test_assistant_agent.py
@@ -33,7 +33,7 @@ def test_ai_user_proxy_agent():
     config_list = autogen.config_list_from_json(
         OAI_CONFIG_LIST,
         file_location=KEY_LOC,
-        filter_dict={"tags": ["gpt-3.5-turbo"]},
+        filter_dict={"tags": ["gpt-4o-mini"]},
     )
     assistant = AssistantAgent(
         "assistant",
@@ -72,7 +72,7 @@ def test_gpt35(human_input_mode="NEVER", max_consecutive_auto_reply=5):
     config_list = autogen.config_list_from_json(
         OAI_CONFIG_LIST,
         file_location=KEY_LOC,
-        filter_dict={"tags": ["gpt-3.5-turbo", "gpt-3.5-turbo-16k"]},
+        filter_dict={"tags": ["gpt-4o-mini"]},
     )
     llm_config = {
         "cache_seed": 42,
@@ -116,7 +116,7 @@ def test_create_execute_script(human_input_mode="NEVER", max_consecutive_auto_re
     config_list = autogen.config_list_from_json(
         OAI_CONFIG_LIST,
         file_location=KEY_LOC,
-        filter_dict={"tags": ["gpt-3.5-turbo"]},
+        filter_dict={"tags": ["gpt-4o-mini"]},
     )
     conversations = {}
     # autogen.ChatCompletion.start_logging(conversations)
@@ -170,7 +170,7 @@ def test_tsp(human_input_mode="NEVER", max_consecutive_auto_reply=2):
         OAI_CONFIG_LIST,
         file_location=KEY_LOC,
         filter_dict={
-            "tags": ["gpt-4", "gpt-4-32k"],
+            "tags": ["gpt-4o"],
         },
     )
     hard_questions = [
10 changes: 5 additions & 5 deletions test/agentchat/test_async_chats.py
@@ -23,10 +23,10 @@
 @pytest.mark.skipif(skip_openai, reason="requested to skip openai tests")
 @pytest.mark.asyncio
 async def test_async_chats():
-    config_list_35 = autogen.config_list_from_json(
+    config_list_4omini = autogen.config_list_from_json(
         OAI_CONFIG_LIST,
         file_location=KEY_LOC,
-        filter_dict={"tags": ["gpt-3.5-turbo"]},
+        filter_dict={"tags": ["gpt-4o-mini"]},
     )

     financial_tasks = [
@@ -39,16 +39,16 @@ async def test_async_chats():

     financial_assistant_1 = AssistantAgent(
         name="Financial_assistant_1",
-        llm_config={"config_list": config_list_35},
+        llm_config={"config_list": config_list_4omini},
         system_message="You are a knowledgeable AI Assistant. Reply TERMINATE when everything is done.",
     )
     financial_assistant_2 = AssistantAgent(
         name="Financial_assistant_2",
-        llm_config={"config_list": config_list_35},
+        llm_config={"config_list": config_list_4omini},
     )
     writer = AssistantAgent(
         name="Writer",
-        llm_config={"config_list": config_list_35},
+        llm_config={"config_list": config_list_4omini},
         is_termination_msg=lambda x: x.get("content", "").find("TERMINATE") >= 0,
         system_message="""
         You are a professional writer, known for
4 changes: 2 additions & 2 deletions test/agentchat/test_async_get_human_input.py
@@ -23,7 +23,7 @@
 @pytest.mark.skipif(skip_openai, reason=reason)
 @pytest.mark.asyncio
 async def test_async_get_human_input():
-    config_list = autogen.config_list_from_json(OAI_CONFIG_LIST, KEY_LOC, filter_dict={"tags": ["gpt-3.5-turbo"]})
+    config_list = autogen.config_list_from_json(OAI_CONFIG_LIST, KEY_LOC, filter_dict={"tags": ["gpt-4o-mini"]})

     # create an AssistantAgent instance named "assistant"
     assistant = autogen.AssistantAgent(
@@ -50,7 +50,7 @@ async def test_async_get_human_input():
 @pytest.mark.skipif(skip_openai, reason=reason)
 @pytest.mark.asyncio
 async def test_async_max_turn():
-    config_list = autogen.config_list_from_json(OAI_CONFIG_LIST, KEY_LOC, filter_dict={"tags": ["gpt-3.5-turbo"]})
+    config_list = autogen.config_list_from_json(OAI_CONFIG_LIST, KEY_LOC, filter_dict={"tags": ["gpt-4o-mini"]})

     # create an AssistantAgent instance named "assistant"
     assistant = autogen.AssistantAgent(
4 changes: 2 additions & 2 deletions test/agentchat/test_cache_agent.py
@@ -120,7 +120,7 @@ def run_conversation(cache_seed, human_input_mode="NEVER", max_consecutive_auto_
         OAI_CONFIG_LIST,
         file_location=KEY_LOC,
         filter_dict={
-            "tags": ["gpt-3.5-turbo", "gpt-3.5-turbo-16k"],
+            "tags": ["gpt-4o-mini"],
         },
     )
     llm_config = {
@@ -167,7 +167,7 @@ def run_groupchat_conversation(cache, human_input_mode="NEVER", max_consecutive_
         OAI_CONFIG_LIST,
         file_location=KEY_LOC,
         filter_dict={
-            "tags": ["gpt-3.5-turbo", "gpt-3.5-turbo-16k"],
+            "tags": ["gpt-4o-mini"],
        },
     )
     llm_config = {
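
Both cache tests pin cache_seed, so identical requests on a rerun are served from the local cache instead of the API; only the tag filter changes in this file. A minimal llm_config of the shape these tests build is sketched below; the config entry is a placeholder, not the repository's real configuration.

# Minimal llm_config shape used by the cache tests: a fixed cache_seed makes
# repeated identical calls hit the on-disk cache. Placeholder credentials only.
llm_config = {
    "cache_seed": 42,
    "config_list": [{"model": "gpt-4o-mini", "api_key": "sk-placeholder", "tags": ["gpt-4o-mini"]}],
}
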