From 861065fb79effb35b271eb7a1070d79a12004089 Mon Sep 17 00:00:00 2001 From: Mark Sze <66362098+marklysze@users.noreply.github.com> Date: Wed, 20 Nov 2024 09:47:25 +1100 Subject: [PATCH 1/5] Update test_web_surfer.py for incorrect web url --- test/agentchat/contrib/test_web_surfer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/agentchat/contrib/test_web_surfer.py b/test/agentchat/contrib/test_web_surfer.py index 94c3013005..4d84d51ee4 100755 --- a/test/agentchat/contrib/test_web_surfer.py +++ b/test/agentchat/contrib/test_web_surfer.py @@ -21,7 +21,7 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..")) from test_assistant_agent import KEY_LOC, OAI_CONFIG_LIST # noqa: E402 -BLOG_POST_URL = "https://ag2ai.github.io/autogen/blog/2023/04/21/LLM-tuning-math" +BLOG_POST_URL = "https://ag2ai.github.io/ag2/blog/2023/04/21/LLM-tuning-math" BLOG_POST_TITLE = "Does Model and Inference Parameter Matter in LLM Applications? - A Case Study for MATH | AutoGen" BING_QUERY = "Microsoft" From 59ccaa214e8be078fcf200903fb3a3287b86160e Mon Sep 17 00:00:00 2001 From: Mark Sze Date: Tue, 19 Nov 2024 23:12:36 +0000 Subject: [PATCH 2/5] Update config on Cost Token Tracking notebook --- notebook/agentchat_cost_token_tracking.ipynb | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/notebook/agentchat_cost_token_tracking.ipynb b/notebook/agentchat_cost_token_tracking.ipynb index 92ee8d1b97..bb9eed06b4 100644 --- a/notebook/agentchat_cost_token_tracking.ipynb +++ b/notebook/agentchat_cost_token_tracking.ipynb @@ -68,7 +68,7 @@ "config_list = autogen.config_list_from_json(\n", " \"OAI_CONFIG_LIST\",\n", " filter_dict={\n", - " \"model\": [\"gpt-3.5-turbo\", \"gpt-3.5-turbo-16k\"], # comment out to get all\n", + " \"tags\": [\"gpt-4o\", \"gpt-4o-mini\"], # comment out to get all\n", " },\n", ")" ] @@ -83,17 +83,17 @@ "```python\n", "config_list = [\n", " {\n", - " \"model\": \"gpt-3.5-turbo\",\n", + " \"model\": \"gpt-4o\",\n", " \"api_key\": \"\",\n", - " \"tags\": [\"gpt-3.5-turbo\"],\n", - " }, # OpenAI API endpoint for gpt-3.5-turbo\n", + " \"tags\": [\"gpt-4o\"],\n", + " }, # OpenAI API endpoint for gpt-4o\n", " {\n", - " \"model\": \"gpt-35-turbo-0613\", # 0613 or newer is needed to use functions\n", + " \"model\": \"gpt-4o-mini\",\n", " \"base_url\": \"\", \n", " \"api_type\": \"azure\", \n", - " \"api_version\": \"2024-02-01\", # 2023-07-01-preview or newer is needed to use functions\n", + " \"api_version\": \"2024-07-18\",\n", " \"api_key\": \"\",\n", - " \"tags\": [\"gpt-3.5-turbo\", \"0613\"],\n", + " \"tags\": [\"gpt-4o-mini\", \"20240718\"],\n", " }\n", "]\n", "```\n", From 25dd5b8fe1a680aa9eaca5133fcb4c6ca3b32fdb Mon Sep 17 00:00:00 2001 From: Mark Sze Date: Tue, 19 Nov 2024 23:20:16 +0000 Subject: [PATCH 3/5] Run Cost Token Tracking notebook with results --- notebook/agentchat_cost_token_tracking.ipynb | 142 ++++++++++++------- 1 file changed, 90 insertions(+), 52 deletions(-) diff --git a/notebook/agentchat_cost_token_tracking.ipynb b/notebook/agentchat_cost_token_tracking.ipynb index bb9eed06b4..297987b6ab 100644 --- a/notebook/agentchat_cost_token_tracking.ipynb +++ b/notebook/agentchat_cost_token_tracking.ipynb @@ -58,7 +58,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -110,14 +110,14 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "0.00020600000000000002\n" + "0.0011125\n" ] } ], @@ -139,14 +139,14 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Price: 109\n" + "Price: 0.144\n" ] } ], @@ -177,7 +177,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -198,7 +198,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -207,20 +207,20 @@ "text": [ "----------------------------------------------------------------------------------------------------\n", "Usage summary excluding cached usage: \n", - "Total cost: 0.00023\n", - "* Model 'gpt-35-turbo': cost: 0.00023, prompt_tokens: 25, completion_tokens: 142, total_tokens: 167\n", + "Total cost: 0.154\n", + "* Model 'gpt-4o-2024-08-06': cost: 0.154, prompt_tokens: 25, completion_tokens: 129, total_tokens: 154\n", "\n", "All completions are non-cached: the total cost with cached completions is the same as actual cost.\n", "----------------------------------------------------------------------------------------------------\n", "----------------------------------------------------------------------------------------------------\n", "Usage summary excluding cached usage: \n", - "Total cost: 0.00023\n", - "* Model 'gpt-35-turbo': cost: 0.00023, prompt_tokens: 25, completion_tokens: 142, total_tokens: 167\n", + "Total cost: 0.154\n", + "* Model 'gpt-4o-2024-08-06': cost: 0.154, prompt_tokens: 25, completion_tokens: 129, total_tokens: 154\n", "----------------------------------------------------------------------------------------------------\n", "----------------------------------------------------------------------------------------------------\n", "Usage summary including cached usage: \n", - "Total cost: 0.00023\n", - "* Model 'gpt-35-turbo': cost: 0.00023, prompt_tokens: 25, completion_tokens: 142, total_tokens: 167\n", + "Total cost: 0.154\n", + "* Model 'gpt-4o-2024-08-06': cost: 0.154, prompt_tokens: 25, completion_tokens: 129, total_tokens: 154\n", "----------------------------------------------------------------------------------------------------\n" ] } @@ -236,15 +236,15 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'total_cost': 0.0002255, 'gpt-35-turbo': {'cost': 0.0002255, 'prompt_tokens': 25, 'completion_tokens': 142, 'total_tokens': 167}}\n", - "{'total_cost': 0.0002255, 'gpt-35-turbo': {'cost': 0.0002255, 'prompt_tokens': 25, 'completion_tokens': 142, 'total_tokens': 167}}\n" + "{'total_cost': 0.154, 'gpt-4o-2024-08-06': {'cost': 0.154, 'prompt_tokens': 25, 'completion_tokens': 129, 'total_tokens': 154}}\n", + "{'total_cost': 0.154, 'gpt-4o-2024-08-06': {'cost': 0.154, 'prompt_tokens': 25, 'completion_tokens': 129, 'total_tokens': 154}}\n" ] } ], @@ -256,7 +256,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -265,12 +265,12 @@ "text": [ "----------------------------------------------------------------------------------------------------\n", "Usage summary excluding cached usage: \n", - "Total cost: 0.00023\n", - "* Model 'gpt-35-turbo': cost: 0.00023, prompt_tokens: 25, completion_tokens: 142, total_tokens: 167\n", + "Total cost: 0.154\n", + "* Model 'gpt-4o-2024-08-06': cost: 0.154, prompt_tokens: 25, completion_tokens: 129, total_tokens: 154\n", "\n", "Usage summary including cached usage: \n", - "Total cost: 0.00045\n", - "* Model 'gpt-35-turbo': cost: 0.00045, prompt_tokens: 50, completion_tokens: 284, total_tokens: 334\n", + "Total cost: 0.308\n", + "* Model 'gpt-4o-2024-08-06': cost: 0.308, prompt_tokens: 50, completion_tokens: 258, total_tokens: 308\n", "----------------------------------------------------------------------------------------------------\n" ] } @@ -284,7 +284,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -303,7 +303,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -314,8 +314,8 @@ "No actual cost incurred (all completions are using cache).\n", "\n", "Usage summary including cached usage: \n", - "Total cost: 0.00023\n", - "* Model 'gpt-35-turbo': cost: 0.00023, prompt_tokens: 25, completion_tokens: 142, total_tokens: 167\n", + "Total cost: 0.154\n", + "* Model 'gpt-4o-2024-08-06': cost: 0.154, prompt_tokens: 25, completion_tokens: 129, total_tokens: 154\n", "----------------------------------------------------------------------------------------------------\n" ] } @@ -340,7 +340,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -354,19 +354,57 @@ "--------------------------------------------------------------------------------\n", "\u001b[33massistant\u001b[0m (to ai_user):\n", "\n", - "To find x, we need to take the cube root of 125. The cube root of a number is the number that, when multiplied by itself three times, gives the original number.\n", + "To solve the equation \\(x^3 = 125\\), you need to find the value of \\(x\\) that makes this equation true. \n", + "\n", + "You can solve for \\(x\\) by taking the cube root of both sides of the equation:\n", + "\n", + "\\[\n", + "x = \\sqrt[3]{125}\n", + "\\]\n", "\n", - "In this case, the cube root of 125 is 5 since 5 * 5 * 5 = 125. Therefore, x = 5.\n", + "Since \\(125\\) is \\(5^3\\), the cube root of \\(125\\) is \\(5\\). Thus,\n", + "\n", + "\\[\n", + "x = 5\n", + "\\]\n", + "\n", + "Therefore, the solution to the equation \\(x^3 = 125\\) is \\(x = 5\\).\n", "\n", "--------------------------------------------------------------------------------\n", "\u001b[33mai_user\u001b[0m (to assistant):\n", "\n", - "That's correct! Well done. The value of x is indeed 5, as you correctly found by taking the cube root of 125. Keep up the good work!\n", + "Can you help me solve the equation \\(2x^2 - 8x = 0\\)?\n", "\n", "--------------------------------------------------------------------------------\n", "\u001b[33massistant\u001b[0m (to ai_user):\n", "\n", - "Thank you! I'm glad I could help. If you have any more questions, feel free to ask!\n", + "Certainly! To solve the equation \\(2x^2 - 8x = 0\\), you can start by factoring the expression on the left-hand side.\n", + "\n", + "First, factor out the greatest common factor, which is \\(2x\\):\n", + "\n", + "\\[\n", + "2x(x - 4) = 0\n", + "\\]\n", + "\n", + "Now, you have a product of two factors equal to zero. According to the zero product property, if the product of two factors is zero, at least one of the factors must be zero. So, you set each factor equal to zero and solve for \\(x\\):\n", + "\n", + "1. \\(2x = 0\\)\n", + "\n", + " Divide both sides by 2 to solve for \\(x\\):\n", + "\n", + " \\[\n", + " x = 0\n", + " \\]\n", + "\n", + "2. \\(x - 4 = 0\\)\n", + "\n", + " Add 4 to both sides to solve for \\(x\\):\n", + "\n", + " \\[\n", + " x = 4\n", + " \\]\n", + "\n", + "So, the solutions to the equation \\(2x^2 - 8x = 0\\) are \\(x = 0\\) and \\(x = 4\\).\n", "\n", "--------------------------------------------------------------------------------\n" ] @@ -374,10 +412,10 @@ { "data": { "text/plain": [ - "ChatResult(chat_id=None, chat_history=[{'content': '$x^3=125$. What is x?', 'role': 'assistant'}, {'content': 'To find x, we need to take the cube root of 125. The cube root of a number is the number that, when multiplied by itself three times, gives the original number.\\n\\nIn this case, the cube root of 125 is 5 since 5 * 5 * 5 = 125. Therefore, x = 5.', 'role': 'user'}, {'content': \"That's correct! Well done. The value of x is indeed 5, as you correctly found by taking the cube root of 125. Keep up the good work!\", 'role': 'assistant'}, {'content': \"Thank you! I'm glad I could help. If you have any more questions, feel free to ask!\", 'role': 'user'}], summary=\"Thank you! I'm glad I could help. If you have any more questions, feel free to ask!\", cost={'usage_including_cached_inference': {'total_cost': 0.000333, 'gpt-35-turbo': {'cost': 0.000333, 'prompt_tokens': 282, 'completion_tokens': 128, 'total_tokens': 410}}, 'usage_excluding_cached_inference': {'total_cost': 0.000333, 'gpt-35-turbo': {'cost': 0.000333, 'prompt_tokens': 282, 'completion_tokens': 128, 'total_tokens': 410}}}, human_input=[])" + "ChatResult(chat_id=None, chat_history=[{'content': '$x^3=125$. What is x?', 'role': 'assistant', 'name': 'ai_user'}, {'content': 'To solve the equation \\\\(x^3 = 125\\\\), you need to find the value of \\\\(x\\\\) that makes this equation true. \\n\\nYou can solve for \\\\(x\\\\) by taking the cube root of both sides of the equation:\\n\\n\\\\[\\nx = \\\\sqrt[3]{125}\\n\\\\]\\n\\nSince \\\\(125\\\\) is \\\\(5^3\\\\), the cube root of \\\\(125\\\\) is \\\\(5\\\\). Thus,\\n\\n\\\\[\\nx = 5\\n\\\\]\\n\\nTherefore, the solution to the equation \\\\(x^3 = 125\\\\) is \\\\(x = 5\\\\).', 'role': 'user', 'name': 'assistant'}, {'content': 'Can you help me solve the equation \\\\(2x^2 - 8x = 0\\\\)?', 'role': 'assistant', 'name': 'ai_user'}, {'content': 'Certainly! To solve the equation \\\\(2x^2 - 8x = 0\\\\), you can start by factoring the expression on the left-hand side.\\n\\nFirst, factor out the greatest common factor, which is \\\\(2x\\\\):\\n\\n\\\\[\\n2x(x - 4) = 0\\n\\\\]\\n\\nNow, you have a product of two factors equal to zero. According to the zero product property, if the product of two factors is zero, at least one of the factors must be zero. So, you set each factor equal to zero and solve for \\\\(x\\\\):\\n\\n1. \\\\(2x = 0\\\\)\\n\\n Divide both sides by 2 to solve for \\\\(x\\\\):\\n\\n \\\\[\\n x = 0\\n \\\\]\\n\\n2. \\\\(x - 4 = 0\\\\)\\n\\n Add 4 to both sides to solve for \\\\(x\\\\):\\n\\n \\\\[\\n x = 4\\n \\\\]\\n\\nSo, the solutions to the equation \\\\(2x^2 - 8x = 0\\\\) are \\\\(x = 0\\\\) and \\\\(x = 4\\\\).', 'role': 'user', 'name': 'assistant'}], summary='Certainly! To solve the equation \\\\(2x^2 - 8x = 0\\\\), you can start by factoring the expression on the left-hand side.\\n\\nFirst, factor out the greatest common factor, which is \\\\(2x\\\\):\\n\\n\\\\[\\n2x(x - 4) = 0\\n\\\\]\\n\\nNow, you have a product of two factors equal to zero. According to the zero product property, if the product of two factors is zero, at least one of the factors must be zero. So, you set each factor equal to zero and solve for \\\\(x\\\\):\\n\\n1. \\\\(2x = 0\\\\)\\n\\n Divide both sides by 2 to solve for \\\\(x\\\\):\\n\\n \\\\[\\n x = 0\\n \\\\]\\n\\n2. \\\\(x - 4 = 0\\\\)\\n\\n Add 4 to both sides to solve for \\\\(x\\\\):\\n\\n \\\\[\\n x = 4\\n \\\\]\\n\\nSo, the solutions to the equation \\\\(2x^2 - 8x = 0\\\\) are \\\\(x = 0\\\\) and \\\\(x = 4\\\\).', cost={'usage_including_cached_inference': {'total_cost': 0.7649999999999999, 'gpt-4o-2024-08-06': {'cost': 0.7649999999999999, 'prompt_tokens': 390, 'completion_tokens': 375, 'total_tokens': 765}}, 'usage_excluding_cached_inference': {'total_cost': 0.7649999999999999, 'gpt-4o-2024-08-06': {'cost': 0.7649999999999999, 'prompt_tokens': 390, 'completion_tokens': 375, 'total_tokens': 765}}}, human_input=[])" ] }, - "execution_count": 9, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -415,7 +453,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -425,8 +463,8 @@ "Agent 'ai_user':\n", "----------------------------------------------------------------------------------------------------\n", "Usage summary excluding cached usage: \n", - "Total cost: 0.00011\n", - "* Model 'gpt-35-turbo': cost: 0.00011, prompt_tokens: 114, completion_tokens: 35, total_tokens: 149\n", + "Total cost: 0.193\n", + "* Model 'gpt-4o-2024-08-06': cost: 0.193, prompt_tokens: 172, completion_tokens: 21, total_tokens: 193\n", "\n", "All completions are non-cached: the total cost with cached completions is the same as actual cost.\n", "----------------------------------------------------------------------------------------------------\n", @@ -434,8 +472,8 @@ "Agent 'assistant':\n", "----------------------------------------------------------------------------------------------------\n", "Usage summary excluding cached usage: \n", - "Total cost: 0.00022\n", - "* Model 'gpt-35-turbo': cost: 0.00022, prompt_tokens: 168, completion_tokens: 93, total_tokens: 261\n", + "Total cost: 0.572\n", + "* Model 'gpt-4o-2024-08-06': cost: 0.572, prompt_tokens: 218, completion_tokens: 354, total_tokens: 572\n", "\n", "All completions are non-cached: the total cost with cached completions is the same as actual cost.\n", "----------------------------------------------------------------------------------------------------\n" @@ -450,7 +488,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -474,17 +512,17 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Actual usage summary for assistant (excluding completion from cache): {'total_cost': 0.0002235, 'gpt-35-turbo': {'cost': 0.0002235, 'prompt_tokens': 168, 'completion_tokens': 93, 'total_tokens': 261}}\n", - "Total usage summary for assistant (including completion from cache): {'total_cost': 0.0002235, 'gpt-35-turbo': {'cost': 0.0002235, 'prompt_tokens': 168, 'completion_tokens': 93, 'total_tokens': 261}}\n", - "Actual usage summary for ai_user_proxy: {'total_cost': 0.0001095, 'gpt-35-turbo': {'cost': 0.0001095, 'prompt_tokens': 114, 'completion_tokens': 35, 'total_tokens': 149}}\n", - "Total usage summary for ai_user_proxy: {'total_cost': 0.0001095, 'gpt-35-turbo': {'cost': 0.0001095, 'prompt_tokens': 114, 'completion_tokens': 35, 'total_tokens': 149}}\n", + "Actual usage summary for assistant (excluding completion from cache): {'total_cost': 0.572, 'gpt-4o-2024-08-06': {'cost': 0.572, 'prompt_tokens': 218, 'completion_tokens': 354, 'total_tokens': 572}}\n", + "Total usage summary for assistant (including completion from cache): {'total_cost': 0.572, 'gpt-4o-2024-08-06': {'cost': 0.572, 'prompt_tokens': 218, 'completion_tokens': 354, 'total_tokens': 572}}\n", + "Actual usage summary for ai_user_proxy: {'total_cost': 0.193, 'gpt-4o-2024-08-06': {'cost': 0.193, 'prompt_tokens': 172, 'completion_tokens': 21, 'total_tokens': 193}}\n", + "Total usage summary for ai_user_proxy: {'total_cost': 0.193, 'gpt-4o-2024-08-06': {'cost': 0.193, 'prompt_tokens': 172, 'completion_tokens': 21, 'total_tokens': 193}}\n", "Actual usage summary for user_proxy: None\n", "Total usage summary for user_proxy: None\n" ] @@ -503,20 +541,20 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'total_cost': 0.000333,\n", - " 'gpt-35-turbo': {'cost': 0.000333,\n", - " 'prompt_tokens': 282,\n", - " 'completion_tokens': 128,\n", - " 'total_tokens': 410}}" + "{'total_cost': 0.7649999999999999,\n", + " 'gpt-4o-2024-08-06': {'cost': 0.7649999999999999,\n", + " 'prompt_tokens': 390,\n", + " 'completion_tokens': 375,\n", + " 'total_tokens': 765}}" ] }, - "execution_count": 13, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -535,7 +573,7 @@ ] }, "kernelspec": { - "display_name": "msft", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -549,7 +587,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.19" + "version": "3.11.10" } }, "nbformat": 4, From 1af8af24f8cab686e12ceb3e596d4255de2d0801 Mon Sep 17 00:00:00 2001 From: Mark Sze <66362098+marklysze@users.noreply.github.com> Date: Wed, 20 Nov 2024 10:44:53 +1100 Subject: [PATCH 4/5] Update test_web_surfer.py --- test/agentchat/contrib/test_web_surfer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/agentchat/contrib/test_web_surfer.py b/test/agentchat/contrib/test_web_surfer.py index 4d84d51ee4..0111c2a03c 100755 --- a/test/agentchat/contrib/test_web_surfer.py +++ b/test/agentchat/contrib/test_web_surfer.py @@ -22,7 +22,7 @@ from test_assistant_agent import KEY_LOC, OAI_CONFIG_LIST # noqa: E402 BLOG_POST_URL = "https://ag2ai.github.io/ag2/blog/2023/04/21/LLM-tuning-math" -BLOG_POST_TITLE = "Does Model and Inference Parameter Matter in LLM Applications? - A Case Study for MATH | AutoGen" +BLOG_POST_TITLE = "Does Model and Inference Parameter Matter in LLM Applications? - A Case Study for MATH | AG2" BING_QUERY = "Microsoft" try: @@ -54,7 +54,7 @@ def test_web_surfer() -> None: page_size = 4096 web_surfer = WebSurferAgent( "web_surfer", - llm_config={"model": "gpt-4", "config_list": []}, + llm_config={"model": "gpt-4o", "config_list": []}, browser_config={"viewport_size": page_size}, ) @@ -110,7 +110,7 @@ def test_web_surfer_oai() -> None: llm_config = {"config_list": config_list, "timeout": 180, "cache_seed": 42} # adding Azure name variations to the model list - model = ["gpt-3.5-turbo-1106", "gpt-3.5-turbo-16k-0613", "gpt-3.5-turbo-16k"] + model = ["gpt-4o", "gpt-4o-mini"] model += [m.replace(".", "") for m in model] summarizer_llm_config = { @@ -160,7 +160,7 @@ def test_web_surfer_bing() -> None: llm_config={ "config_list": [ { - "model": "gpt-3.5-turbo-16k", + "model": "gpt-4o", "api_key": "sk-PLACEHOLDER_KEY", } ] From 0492db4568e872d0c624a502675b701d2ce3fca8 Mon Sep 17 00:00:00 2001 From: Mark Sze Date: Wed, 20 Nov 2024 00:13:07 +0000 Subject: [PATCH 5/5] URL fixes and browser utils test fix --- .github/PULL_REQUEST_TEMPLATE.md | 2 +- autogen/agentchat/contrib/agent_eval/README.md | 6 ++---- notebook/JSON_mode_example.ipynb | 2 +- notebook/agentchat_MathChat.ipynb | 2 +- notebook/agenteval_cq_math.ipynb | 2 +- notebook/autogen_uniformed_api_calling.ipynb | 12 ++++++------ notebook/oai_chatgpt_gpt4.ipynb | 2 +- test/test_browser_utils.py | 6 +++--- website/blog/2023-11-20-AgentEval/index.mdx | 2 +- website/blog/2024-01-25-AutoGenBench/index.mdx | 4 ++-- website/blog/2024-05-24-Agent/index.mdx | 2 +- website/blog/2024-06-21-AgentEval/index.mdx | 4 ++-- website/docs/FAQ.mdx | 4 ++-- website/docs/autogen-studio/getting-started.md | 4 ++-- .../about-using-nonopenai-models.md | 2 +- 15 files changed, 27 insertions(+), 29 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 9f064605c5..1893c28239 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -12,6 +12,6 @@ ## Checks -- [ ] I've included any doc changes needed for https://ag2ai.github.io/autogen/. See https://ag2ai.github.io/ag2/docs/Contribute#documentation to build and test documentation locally. +- [ ] I've included any doc changes needed for https://ag2ai.github.io/ag2/. See https://ag2ai.github.io/ag2/docs/Contribute#documentation to build and test documentation locally. - [ ] I've added tests (if relevant) corresponding to the changes introduced in this PR. - [ ] I've made sure all auto checks have passed. diff --git a/autogen/agentchat/contrib/agent_eval/README.md b/autogen/agentchat/contrib/agent_eval/README.md index cd05199aa1..b9a9815e2d 100644 --- a/autogen/agentchat/contrib/agent_eval/README.md +++ b/autogen/agentchat/contrib/agent_eval/README.md @@ -1,9 +1,7 @@ -Agents for running the [AgentEval](https://ag2ai.github.io/autogen/blog/2023/11/20/AgentEval/) pipeline. +Agents for running the [AgentEval](https://ag2ai.github.io/ag2/blog/2023/11/20/AgentEval/) pipeline. AgentEval is a process for evaluating a LLM-based system's performance on a given task. When given a task to evaluate and a few example runs, the critic and subcritic agents create evaluation criteria for evaluating a system's solution. Once the criteria has been created, the quantifier agent can evaluate subsequent task solutions based on the generated criteria. -For more information see: [AgentEval Integration Roadmap](https://github.com/microsoft/autogen/issues/2162) - -See our [blog post](https://ag2ai.github.io/autogen/blog/2024/06/21/AgentEval) for usage examples and general explanations. +See our [blog post](https://ag2ai.github.io/ag2/blog/2024/06/21/AgentEval) for usage examples and general explanations. diff --git a/notebook/JSON_mode_example.ipynb b/notebook/JSON_mode_example.ipynb index 0e8d65d213..3dd6f7510b 100644 --- a/notebook/JSON_mode_example.ipynb +++ b/notebook/JSON_mode_example.ipynb @@ -19,7 +19,7 @@ "\n", "\n", "Please find documentation about this feature in OpenAI [here](https://platform.openai.com/docs/guides/text-generation/json-mode).\n", - "More information about Agent Descriptions is located [here](https://ag2ai.github.io/autogen/blog/2023/12/29/AgentDescriptions/)\n", + "More information about Agent Descriptions is located [here](https://ag2ai.github.io/ag2/blog/2023/12/29/AgentDescriptions/)\n", "\n", "Benefits\n", "- This contribution provides a method to implement precise speaker transitions based on content of the input message. The example can prevent Prompt hacks that use coersive language.\n", diff --git a/notebook/agentchat_MathChat.ipynb b/notebook/agentchat_MathChat.ipynb index 5c6fcc30c8..0bafb6606b 100644 --- a/notebook/agentchat_MathChat.ipynb +++ b/notebook/agentchat_MathChat.ipynb @@ -9,7 +9,7 @@ "\n", "AutoGen offers conversable agents powered by LLM, tool or human, which can be used to perform tasks collectively via automated chat. This framework allows tool use and human participation through multi-agent conversation. Please find documentation about this feature [here](https://ag2ai.github.io/ag2/docs/Use-Cases/agent_chat).\n", "\n", - "MathChat is an experimental conversational framework for math problem solving. In this notebook, we demonstrate how to use MathChat to solve math problems. MathChat uses the `AssistantAgent` and `MathUserProxyAgent`, which is similar to the usage of `AssistantAgent` and `UserProxyAgent` in other notebooks (e.g., [Automated Task Solving with Code Generation, Execution & Debugging](https://github.com/ag2ai/ag2/blob/main/notebook/agentchat_auto_feedback_from_code_execution.ipynb)). Essentially, `MathUserProxyAgent` implements a different auto reply mechanism corresponding to the MathChat prompts. You can find more details in the paper [An Empirical Study on Challenging Math Problem Solving with GPT-4](https://arxiv.org/abs/2306.01337) or the [blogpost](https://ag2ai.github.io/autogen/blog/2023/06/28/MathChat).\n", + "MathChat is an experimental conversational framework for math problem solving. In this notebook, we demonstrate how to use MathChat to solve math problems. MathChat uses the `AssistantAgent` and `MathUserProxyAgent`, which is similar to the usage of `AssistantAgent` and `UserProxyAgent` in other notebooks (e.g., [Automated Task Solving with Code Generation, Execution & Debugging](https://github.com/ag2ai/ag2/blob/main/notebook/agentchat_auto_feedback_from_code_execution.ipynb)). Essentially, `MathUserProxyAgent` implements a different auto reply mechanism corresponding to the MathChat prompts. You can find more details in the paper [An Empirical Study on Challenging Math Problem Solving with GPT-4](https://arxiv.org/abs/2306.01337) or the [blogpost](https://ag2ai.github.io/ag2/blog/2023/06/28/MathChat).\n", "\n", "````{=mdx}\n", ":::info Requirements\n", diff --git a/notebook/agenteval_cq_math.ipynb b/notebook/agenteval_cq_math.ipynb index e9dc5ca030..21a59ef952 100644 --- a/notebook/agenteval_cq_math.ipynb +++ b/notebook/agenteval_cq_math.ipynb @@ -17,7 +17,7 @@ "\n", "![AgentEval](https://media.githubusercontent.com/media/ag2ai/ag2/main/website/blog/2023-11-20-AgentEval/img/agenteval-CQ.png)\n", "\n", - "For more detailed explanations, please refer to the accompanying [blog post](https://ag2ai.github.io/autogen/blog/2023/11/20/AgentEval)\n", + "For more detailed explanations, please refer to the accompanying [blog post](https://ag2ai.github.io/ag2/blog/2023/11/20/AgentEval)\n", "\n", "## Requirements\n", "\n", diff --git a/notebook/autogen_uniformed_api_calling.ipynb b/notebook/autogen_uniformed_api_calling.ipynb index 58175b31af..aad19ed078 100644 --- a/notebook/autogen_uniformed_api_calling.ipynb +++ b/notebook/autogen_uniformed_api_calling.ipynb @@ -22,7 +22,7 @@ "\n", "... and more to come!\n", "\n", - "You can also [plug in your local deployed LLM](https://ag2ai.github.io/autogen/blog/2024/01/26/Custom-Models) into AutoGen if needed." + "You can also [plug in your local deployed LLM](https://ag2ai.github.io/ag2/blog/2024/01/26/Custom-Models) into AutoGen if needed." ] }, { @@ -376,11 +376,11 @@ ], "metadata": { "front_matter": { - "description": "Uniform interface to call different LLM.", - "tags": [ - "integration", - "custom model" - ] + "description": "Uniform interface to call different LLM.", + "tags": [ + "integration", + "custom model" + ] }, "kernelspec": { "display_name": "autodev", diff --git a/notebook/oai_chatgpt_gpt4.ipynb b/notebook/oai_chatgpt_gpt4.ipynb index 2e91ab005b..1011083f06 100644 --- a/notebook/oai_chatgpt_gpt4.ipynb +++ b/notebook/oai_chatgpt_gpt4.ipynb @@ -33,7 +33,7 @@ "\n", "In this notebook, we tune OpenAI ChatGPT (both GPT-3.5 and GPT-4) models for math problem solving. We use [the MATH benchmark](https://crfm.stanford.edu/helm/latest/?group=math_chain_of_thought) for measuring mathematical problem solving on competition math problems with chain-of-thoughts style reasoning.\n", "\n", - "Related link: [Blogpost](https://ag2ai.github.io/autogen/blog/2023/04/21/LLM-tuning-math) based on this experiment.\n", + "Related link: [Blogpost](https://ag2ai.github.io/ag2/blog/2023/04/21/LLM-tuning-math) based on this experiment.\n", "\n", "## Requirements\n", "\n", diff --git a/test/test_browser_utils.py b/test/test_browser_utils.py index 30ce662388..73fd619940 100755 --- a/test/test_browser_utils.py +++ b/test/test_browser_utils.py @@ -16,15 +16,15 @@ import requests from agentchat.test_assistant_agent import KEY_LOC # noqa: E402 -BLOG_POST_URL = "https://ag2ai.github.io/autogen/blog/2023/04/21/LLM-tuning-math" -BLOG_POST_TITLE = "Does Model and Inference Parameter Matter in LLM Applications? - A Case Study for MATH | AutoGen" +BLOG_POST_URL = "https://ag2ai.github.io/ag2/blog/2023/04/21/LLM-tuning-math" +BLOG_POST_TITLE = "Does Model and Inference Parameter Matter in LLM Applications? - A Case Study for MATH | AG2" BLOG_POST_STRING = "Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? What about inference parameters?" WIKIPEDIA_URL = "https://en.wikipedia.org/wiki/Microsoft" WIKIPEDIA_TITLE = "Microsoft - Wikipedia" WIKIPEDIA_STRING = "Redmond" -PLAIN_TEXT_URL = "https://raw.githubusercontent.com/microsoft/autogen/main/README.md" +PLAIN_TEXT_URL = "https://raw.githubusercontent.com/ag2ai/ag2/main/README.md" IMAGE_URL = "https://github.com/afourney.png" PDF_URL = "https://arxiv.org/pdf/2308.08155.pdf" diff --git a/website/blog/2023-11-20-AgentEval/index.mdx b/website/blog/2023-11-20-AgentEval/index.mdx index b80592102a..4f0e5c9727 100644 --- a/website/blog/2023-11-20-AgentEval/index.mdx +++ b/website/blog/2023-11-20-AgentEval/index.mdx @@ -14,7 +14,7 @@ tags: [LLM, GPT, evaluation, task utility] **TL;DR:** * As a developer of an LLM-powered application, how can you assess the utility it brings to end users while helping them with their tasks? * To shed light on the question above, we introduce `AgentEval` — the first version of the framework to assess the utility of any LLM-powered application crafted to assist users in specific tasks. AgentEval aims to simplify the evaluation process by automatically proposing a set of criteria tailored to the unique purpose of your application. This allows for a comprehensive assessment, quantifying the utility of your application against the suggested criteria. -* We demonstrate how `AgentEval` work using [math problems dataset](https://ag2ai.github.io/autogen/blog/2023/06/28/MathChat) as an example in the [following notebook](https://github.com/ag2ai/ag2/blob/main/notebook/agenteval_cq_math.ipynb). Any feedback would be useful for future development. Please contact us on our [Discord](http://aka.ms/autogen-dc). +* We demonstrate how `AgentEval` work using [math problems dataset](https://ag2ai.github.io/ag2/blog/2023/06/28/MathChat) as an example in the [following notebook](https://github.com/ag2ai/ag2/blob/main/notebook/agenteval_cq_math.ipynb). Any feedback would be useful for future development. Please contact us on our [Discord](http://aka.ms/autogen-dc). ## Introduction diff --git a/website/blog/2024-01-25-AutoGenBench/index.mdx b/website/blog/2024-01-25-AutoGenBench/index.mdx index b2d8b68fe5..d2a4e8b541 100644 --- a/website/blog/2024-01-25-AutoGenBench/index.mdx +++ b/website/blog/2024-01-25-AutoGenBench/index.mdx @@ -42,7 +42,7 @@ autogenbench tabulate Results/human_eval_two_agents ## Introduction -Measurement and evaluation are core components of every major AI or ML research project. The same is true for AutoGen. To this end, today we are releasing AutoGenBench, a standalone command line tool that we have been using to guide development of AutoGen. Conveniently, AutoGenBench handles: downloading, configuring, running, and reporting results of agents on various public benchmark datasets. In addition to reporting top-line numbers, each AutoGenBench run produces a comprehensive set of logs and telemetry that can be used for debugging, profiling, computing custom metrics, and as input to [AgentEval](https://ag2ai.github.io/autogen/blog/2023/11/20/AgentEval). In the remainder of this blog post, we outline core design principles for AutoGenBench (key to understanding its operation); present a guide to installing and running AutoGenBench; outline a roadmap for evaluation; and conclude with an open call for contributions. +Measurement and evaluation are core components of every major AI or ML research project. The same is true for AutoGen. To this end, today we are releasing AutoGenBench, a standalone command line tool that we have been using to guide development of AutoGen. Conveniently, AutoGenBench handles: downloading, configuring, running, and reporting results of agents on various public benchmark datasets. In addition to reporting top-line numbers, each AutoGenBench run produces a comprehensive set of logs and telemetry that can be used for debugging, profiling, computing custom metrics, and as input to [AgentEval](https://ag2ai.github.io/ag2/blog/2023/11/20/AgentEval). In the remainder of this blog post, we outline core design principles for AutoGenBench (key to understanding its operation); present a guide to installing and running AutoGenBench; outline a roadmap for evaluation; and conclude with an open call for contributions. ## Design Principles @@ -52,7 +52,7 @@ AutoGenBench is designed around three core design principles. Knowing these prin - **Isolation:** Agents interact with their worlds in both subtle and overt ways. For example an agent may install a python library or write a file to disk. This can lead to ordering effects that can impact future measurements. Consider, for example, comparing two agents on a common benchmark. One agent may appear more efficient than the other simply because it ran second, and benefitted from the hard work the first agent did in installing and debugging necessary Python libraries. To address this, AutoGenBench isolates each task in its own Docker container. This ensures that all runs start with the same initial conditions. (Docker is also a _much safer way to run agent-produced code_, in general.) -- **Instrumentation:** While top-line metrics are great for comparing agents or models, we often want much more information about how the agents are performing, where they are getting stuck, and how they can be improved. We may also later think of new research questions that require computing a different set of metrics. To this end, AutoGenBench is designed to log everything, and to compute metrics from those logs. This ensures that one can always go back to the logs to answer questions about what happened, run profiling software, or feed the logs into tools like [AgentEval](https://ag2ai.github.io/autogen/blog/2023/11/20/AgentEval). +- **Instrumentation:** While top-line metrics are great for comparing agents or models, we often want much more information about how the agents are performing, where they are getting stuck, and how they can be improved. We may also later think of new research questions that require computing a different set of metrics. To this end, AutoGenBench is designed to log everything, and to compute metrics from those logs. This ensures that one can always go back to the logs to answer questions about what happened, run profiling software, or feed the logs into tools like [AgentEval](https://ag2ai.github.io/ag2/blog/2023/11/20/AgentEval). ## Installing and Running AutoGenBench diff --git a/website/blog/2024-05-24-Agent/index.mdx b/website/blog/2024-05-24-Agent/index.mdx index 15c5c718ec..1662999763 100644 --- a/website/blog/2024-05-24-Agent/index.mdx +++ b/website/blog/2024-05-24-Agent/index.mdx @@ -143,7 +143,7 @@ better with low cost. [EcoAssistant](/blog/2023/11/09/EcoAssistant) is a good ex There are certainly tradeoffs to make. The large design space of multi-agents offers these tradeoffs and opens up new opportunities for optimization. -> Over a year since the debut of Ask AT&T, the generative AI platform to which we’ve onboarded over 80,000 users, AT&T has been enhancing its capabilities by incorporating 'AI Agents'. These agents, powered by the Autogen framework pioneered by Microsoft (https://ag2ai.github.io/autogen/blog/2023/12/01/AutoGenStudio/), are designed to tackle complicated workflows and tasks that traditional language models find challenging. To drive collaboration, AT&T is contributing back to the open-source project by introducing features that facilitate enhanced security and role-based access for various projects and data. +> Over a year since the debut of Ask AT&T, the generative AI platform to which we’ve onboarded over 80,000 users, AT&T has been enhancing its capabilities by incorporating 'AI Agents'. These agents, powered by the Autogen framework pioneered by Microsoft (https://ag2ai.github.io/ag2/blog/2023/12/01/AutoGenStudio/), are designed to tackle complicated workflows and tasks that traditional language models find challenging. To drive collaboration, AT&T is contributing back to the open-source project by introducing features that facilitate enhanced security and role-based access for various projects and data. > > > Andy Markus, Chief Data Officer at AT&T diff --git a/website/blog/2024-06-21-AgentEval/index.mdx b/website/blog/2024-06-21-AgentEval/index.mdx index 0801faaae2..e277096240 100644 --- a/website/blog/2024-06-21-AgentEval/index.mdx +++ b/website/blog/2024-06-21-AgentEval/index.mdx @@ -15,13 +15,13 @@ tags: [LLM, GPT, evaluation, task utility] TL;DR: * As a developer, how can you assess the utility and effectiveness of an LLM-powered application in helping end users with their tasks? -* To shed light on the question above, we previously introduced [`AgentEval`](https://ag2ai.github.io/autogen/blog/2023/11/20/AgentEval/) — a framework to assess the multi-dimensional utility of any LLM-powered application crafted to assist users in specific tasks. We have now embedded it as part of the AutoGen library to ease developer adoption. +* To shed light on the question above, we previously introduced [`AgentEval`](https://ag2ai.github.io/ag2/blog/2023/11/20/AgentEval/) — a framework to assess the multi-dimensional utility of any LLM-powered application crafted to assist users in specific tasks. We have now embedded it as part of the AutoGen library to ease developer adoption. * Here, we introduce an updated version of AgentEval that includes a verification process to estimate the robustness of the QuantifierAgent. More details can be found in [this paper](https://arxiv.org/abs/2405.02178). ## Introduction -Previously introduced [`AgentEval`](https://ag2ai.github.io/autogen/blog/2023/11/20/AgentEval/) is a comprehensive framework designed to bridge the gap in assessing the utility of LLM-powered applications. It leverages recent advancements in LLMs to offer a scalable and cost-effective alternative to traditional human evaluations. The framework comprises three main agents: `CriticAgent`, `QuantifierAgent`, and `VerifierAgent`, each playing a crucial role in assessing the task utility of an application. +Previously introduced [`AgentEval`](https://ag2ai.github.io/ag2/blog/2023/11/20/AgentEval/) is a comprehensive framework designed to bridge the gap in assessing the utility of LLM-powered applications. It leverages recent advancements in LLMs to offer a scalable and cost-effective alternative to traditional human evaluations. The framework comprises three main agents: `CriticAgent`, `QuantifierAgent`, and `VerifierAgent`, each playing a crucial role in assessing the task utility of an application. **CriticAgent: Defining the Criteria** diff --git a/website/docs/FAQ.mdx b/website/docs/FAQ.mdx index e588725289..5d6152bcc8 100644 --- a/website/docs/FAQ.mdx +++ b/website/docs/FAQ.mdx @@ -34,8 +34,8 @@ In version >=1, OpenAI renamed their `api_base` parameter to `base_url`. So for Yes. You currently have two options: -- Autogen can work with any API endpoint which complies with OpenAI-compatible RESTful APIs - e.g. serving local LLM via FastChat or LM Studio. Please check https://ag2ai.github.io/autogen/blog/2023/07/14/Local-LLMs for an example. -- You can supply your own custom model implementation and use it with Autogen. Please check https://ag2ai.github.io/autogen/blog/2024/01/26/Custom-Models for more information. +- Autogen can work with any API endpoint which complies with OpenAI-compatible RESTful APIs - e.g. serving local LLM via FastChat or LM Studio. Please check https://ag2ai.github.io/ag2/blog/2023/07/14/Local-LLMs for an example. +- You can supply your own custom model implementation and use it with Autogen. Please check https://ag2ai.github.io/ag2/blog/2024/01/26/Custom-Models for more information. ## Handle Rate Limit Error and Timeout Error diff --git a/website/docs/autogen-studio/getting-started.md b/website/docs/autogen-studio/getting-started.md index 1ca954bfc6..9476ae3311 100644 --- a/website/docs/autogen-studio/getting-started.md +++ b/website/docs/autogen-studio/getting-started.md @@ -5,7 +5,7 @@ ![ARA](./img/ara_stockprices.png) -AutoGen Studio is an low-code interface built to help you rapidly prototype AI agents, enhance them with skills, compose them into workflows and interact with them to accomplish tasks. It is built on top of the [AutoGen](https://ag2ai.github.io/autogen) framework, which is a toolkit for building AI agents. +AutoGen Studio is an low-code interface built to help you rapidly prototype AI agents, enhance them with skills, compose them into workflows and interact with them to accomplish tasks. It is built on top of the [AutoGen](https://ag2ai.github.io/ag2) framework, which is a toolkit for building AI agents. Code for AutoGen Studio is on GitHub at [build-with-ag2](https://github.com/ag2ai/build-with-ag2/tree/main/samples/apps/autogen-studio) @@ -113,4 +113,4 @@ If you are building a production application, please use the AutoGen framework a ## Acknowledgements -AutoGen Studio is Based on the [AutoGen](https://ag2ai.github.io/autogen) project. It was adapted from a research prototype built in October 2023 (original credits: Gagan Bansal, Adam Fourney, Victor Dibia, Piali Choudhury, Saleema Amershi, Ahmed Awadallah, Chi Wang). +AutoGen Studio is Based on the [AutoGen](https://ag2ai.github.io/ag2) project. It was adapted from a research prototype built in October 2023 (original credits: Gagan Bansal, Adam Fourney, Victor Dibia, Piali Choudhury, Saleema Amershi, Ahmed Awadallah, Chi Wang). diff --git a/website/docs/topics/non-openai-models/about-using-nonopenai-models.md b/website/docs/topics/non-openai-models/about-using-nonopenai-models.md index 9ca768d5e7..41134a5224 100644 --- a/website/docs/topics/non-openai-models/about-using-nonopenai-models.md +++ b/website/docs/topics/non-openai-models/about-using-nonopenai-models.md @@ -1,7 +1,7 @@ # Non-OpenAI Models AutoGen allows you to use non-OpenAI models through proxy servers that provide -an OpenAI-compatible API or a [custom model client](https://ag2ai.github.io/autogen/blog/2024/01/26/Custom-Models) +an OpenAI-compatible API or a [custom model client](https://ag2ai.github.io/ag2/blog/2024/01/26/Custom-Models) class. Benefits of this flexibility include access to hundreds of models, assigning specialized