Fix Exception #21

Merged 7 commits on Sep 13, 2024
8 changes: 1 addition & 7 deletions autogen/oai/bedrock.py
@@ -204,13 +204,7 @@ def create(self, params):
if len(tool_config["tools"]) > 0:
request_args["toolConfig"] = tool_config

try:
response = self.bedrock_runtime.converse(
**request_args,
)
except Exception as e:
raise RuntimeError(f"Failed to get response from Bedrock: {e}")

response = self.bedrock_runtime.converse(**request_args)
if response is None:
raise RuntimeError(f"Failed to get response from Bedrock after retrying {self._retries} times.")

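Note on this hunk: the local try/except around `converse()` is removed so that `botocore` errors propagate to the centralized handling added in `client.py` below, instead of being re-wrapped as `RuntimeError`. A minimal sketch of the resulting call, assuming `bedrock_runtime` is a boto3 `bedrock-runtime` client and `request_args` is the prepared request dict:

```python
def call_converse(bedrock_runtime, request_args: dict):
    """Minimal sketch: call Bedrock's Converse API and let boto3 errors propagate."""
    # No local try/except: botocore's BotoCoreError / ClientError bubble up to
    # OpenAIWrapper.create, which logs the failure and falls back to the next
    # configured client (see the client.py hunk below).
    response = bedrock_runtime.converse(**request_args)
    if response is None:
        raise RuntimeError("Failed to get response from Bedrock.")
    return response
```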
71 changes: 70 additions & 1 deletion autogen/oai/client.py
@@ -49,59 +49,106 @@
ERROR = None

try:
from google.api_core.exceptions import ( # noqa
InternalServerError as gemini_InternalServerError,
ResourceExhausted as gemini_ResourceExhausted,
)

from autogen.oai.gemini import GeminiClient

gemini_import_exception: Optional[ImportError] = None
except ImportError as e:
gemini_InternalServerError = gemini_ResourceExhausted = Exception
gemini_import_exception = e

try:
from anthropic import ( # noqa
InternalServerError as anthorpic_InternalServerError,
RateLimitError as anthorpic_RateLimitError,
)

from autogen.oai.anthropic import AnthropicClient

anthropic_import_exception: Optional[ImportError] = None
except ImportError as e:
anthorpic_InternalServerError = anthorpic_RateLimitError = Exception
anthropic_import_exception = e

try:
from mistralai.models import ( # noqa
HTTPValidationError as mistral_HTTPValidationError,
SDKError as mistral_SDKError,
)

from autogen.oai.mistral import MistralAIClient

mistral_import_exception: Optional[ImportError] = None
except ImportError as e:
mistral_SDKError = mistral_HTTPValidationError = Exception
mistral_import_exception = e

try:
from together.error import TogetherException as together_TogetherException

from autogen.oai.together import TogetherClient

together_import_exception: Optional[ImportError] = None
except ImportError as e:
together_TogetherException = Exception
together_import_exception = e

try:
from groq import ( # noqa
APIConnectionError as groq_APIConnectionError,
InternalServerError as groq_InternalServerError,
RateLimitError as groq_RateLimitError,
)

from autogen.oai.groq import GroqClient

groq_import_exception: Optional[ImportError] = None
except ImportError as e:
groq_InternalServerError = groq_RateLimitError = groq_APIConnectionError = Exception
groq_import_exception = e

try:
from cohere.errors import ( # noqa
InternalServerError as cohere_InternalServerError,
ServiceUnavailableError as cohere_ServiceUnavailableError,
TooManyRequestsError as cohere_TooManyRequestsError,
)

from autogen.oai.cohere import CohereClient

cohere_import_exception: Optional[ImportError] = None
except ImportError as e:
cohere_InternalServerError = cohere_TooManyRequestsError = cohere_ServiceUnavailableError = Exception
cohere_import_exception = e

try:
from ollama import ( # noqa
RequestError as ollama_RequestError,
ResponseError as ollama_ResponseError,
)

from autogen.oai.ollama import OllamaClient

ollama_import_exception: Optional[ImportError] = None
except ImportError as e:
ollama_RequestError = ollama_ResponseError = Exception
ollama_import_exception = e

try:
from botocore.exceptions import ( # noqa
BotoCoreError as bedrock_BotoCoreError,
ClientError as bedrock_ClientError,
)

from autogen.oai.bedrock import BedrockClient

bedrock_import_exception: Optional[ImportError] = None
except ImportError as e:
bedrock_BotoCoreError = bedrock_ClientError = Exception
bedrock_import_exception = e

logger = logging.getLogger(__name__)
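
The import block above follows one pattern per optional provider: import the SDK's error types together with the AutoGen client, and on `ImportError` alias every error name to the builtin `Exception` while recording the original `ImportError`, so the module still imports and the consolidated `except` tuple further down stays valid. A minimal sketch of the pattern for a hypothetical provider (the names below are placeholders, not a real SDK):

```python
from typing import Optional

try:
    # Real providers export their own error types (see the hunk above);
    # `someprovider` and these names are placeholders for illustration only.
    from someprovider import RateLimitError as someprovider_RateLimitError  # noqa

    from autogen.oai.someprovider import SomeProviderClient

    someprovider_import_exception: Optional[ImportError] = None
except ImportError as e:
    # Alias the error name to Exception so the module (and the shared
    # `except (...)` tuple) keeps working without the optional dependency,
    # and keep the ImportError so a helpful install message can be raised
    # when this provider is actually selected.
    someprovider_RateLimitError = Exception
    someprovider_import_exception = e
```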
@@ -544,7 +591,7 @@ def _register_default_client(self, config: Dict[str, Any], openai_config: Dict[s
self._clients.append(client)
elif api_type is not None and api_type.startswith("ollama"):
if ollama_import_exception:
raise ImportError("Please install `ollama` to use the Ollama API.")
raise ImportError("Please install `ollama` and `fix-busted-json` to use the Ollama API.")
client = OllamaClient(**openai_config)
self._clients.append(client)
elif api_type is not None and api_type.startswith("bedrock"):
Expand Down Expand Up @@ -791,6 +838,28 @@ def yes_or_no_filter(context, response):
logger.debug(f"config {i} failed", exc_info=True)
if i == last:
raise
except (
gemini_InternalServerError,
gemini_ResourceExhausted,
anthorpic_InternalServerError,
anthorpic_RateLimitError,
mistral_SDKError,
mistral_HTTPValidationError,
together_TogetherException,
groq_InternalServerError,
groq_RateLimitError,
groq_APIConnectionError,
cohere_InternalServerError,
cohere_TooManyRequestsError,
cohere_ServiceUnavailableError,
ollama_RequestError,
ollama_ResponseError,
bedrock_BotoCoreError,
bedrock_ClientError,
):
logger.debug(f"config {i} failed", exc_info=True)
if i == last:
raise
else:
# add cost calculation before caching no matter filter is passed or not
if price is not None:
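The new `except` clause mirrors the existing OpenAI error handling in the same loop: any of the listed provider errors logs the failure for config `i` and falls through to the next config, re-raising only once the last config has been tried. A rough sketch of that control flow with simplified names (the real loop is `OpenAIWrapper.create`):

```python
import logging

logger = logging.getLogger(__name__)

# Stand-in for the per-provider error aliases imported at module level;
# builtin exceptions are used here only to keep the sketch runnable.
PROVIDER_ERRORS = (TimeoutError, ConnectionError)

def create_with_fallback(configs, call):
    """Try each client config in order; skip past transient provider errors."""
    last = len(configs) - 1
    for i, config in enumerate(configs):
        try:
            return call(config)
        except PROVIDER_ERRORS:
            logger.debug(f"config {i} failed", exc_info=True)
            if i == last:
                raise  # nothing left to fall back to, surface the error
```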
127 changes: 48 additions & 79 deletions autogen/oai/cohere.py
@@ -172,82 +172,22 @@ def create(self, params: Dict) -> ChatCompletion:

# Stream if in parameters
streaming = True if "stream" in params and params["stream"] else False
cohere_finish = ""

max_retries = 5
for attempt in range(max_retries):
ans = None
try:
if streaming:
response = client.chat_stream(**cohere_params)
else:
response = client.chat(**cohere_params)
except CohereRateLimitError as e:
raise RuntimeError(f"Cohere exception occurred: {e}")
else:

if streaming:
# Streaming...
ans = ""
for event in response:
if event.event_type == "text-generation":
ans = ans + event.text
elif event.event_type == "tool-calls-generation":
# When streaming, tool calls are compiled at the end into a single event_type
ans = event.text
cohere_finish = "tool_calls"
tool_calls = []
for tool_call in event.tool_calls:
tool_calls.append(
ChatCompletionMessageToolCall(
id=str(random.randint(0, 100000)),
function={
"name": tool_call.name,
"arguments": (
"" if tool_call.parameters is None else json.dumps(tool_call.parameters)
),
},
type="function",
)
)

# Not using billed_units, but that may be better for cost purposes
prompt_tokens = event.response.meta.tokens.input_tokens
completion_tokens = event.response.meta.tokens.output_tokens
total_tokens = prompt_tokens + completion_tokens

response_id = event.response.response_id
else:
# Non-streaming finished
ans: str = response.text

# Not using billed_units, but that may be better for cost purposes
prompt_tokens = response.meta.tokens.input_tokens
completion_tokens = response.meta.tokens.output_tokens
total_tokens = prompt_tokens + completion_tokens

response_id = response.response_id
break

if response is not None:

response_content = ans

if streaming:
# Streaming response
if cohere_finish == "":
cohere_finish = "stop"
tool_calls = None
else:
# Non-streaming response
# If we have tool calls as the response, populate completed tool calls for our return OAI response
if response.tool_calls is not None:
cohere_finish = "stop"
tool_calls = None
ans = None
if streaming:
response = client.chat_stream(**cohere_params)
# Streaming...
ans = ""
for event in response:
if event.event_type == "text-generation":
ans = ans + event.text
elif event.event_type == "tool-calls-generation":
# When streaming, tool calls are compiled at the end into a single event_type
ans = event.text
cohere_finish = "tool_calls"
tool_calls = []
for tool_call in response.tool_calls:

# if parameters are null, clear them out (Cohere can return a string "null" if no parameter values)

for tool_call in event.tool_calls:
tool_calls.append(
ChatCompletionMessageToolCall(
id=str(random.randint(0, 100000)),
@@ -260,16 +200,45 @@ def create(self, params: Dict) -> ChatCompletion:
type="function",
)
)
else:
cohere_finish = "stop"
tool_calls = None

# Not using billed_units, but that may be better for cost purposes
prompt_tokens = event.response.meta.tokens.input_tokens
completion_tokens = event.response.meta.tokens.output_tokens
total_tokens = prompt_tokens + completion_tokens
response_id = event.response.response_id
else:
raise RuntimeError(f"Failed to get response from Cohere after retrying {attempt + 1} times.")
response = client.chat(**cohere_params)
ans: str = response.text

# Not using billed_units, but that may be better for cost purposes
prompt_tokens = response.meta.tokens.input_tokens
completion_tokens = response.meta.tokens.output_tokens
total_tokens = prompt_tokens + completion_tokens

response_id = response.response_id
# If we have tool calls as the response, populate completed tool calls for our return OAI response
if response.tool_calls is not None:
cohere_finish = "tool_calls"
tool_calls = []
for tool_call in response.tool_calls:

# if parameters are null, clear them out (Cohere can return a string "null" if no parameter values)

tool_calls.append(
ChatCompletionMessageToolCall(
id=str(random.randint(0, 100000)),
function={
"name": tool_call.name,
"arguments": ("" if tool_call.parameters is None else json.dumps(tool_call.parameters)),
},
type="function",
)
)

# 3. convert output
message = ChatCompletionMessage(
role="assistant",
content=response_content,
content=ans,
function_call=None,
tool_calls=tool_calls,
)
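The cohere.py rewrite drops the `max_retries` loop and the `try/except CohereRateLimitError` wrapper: `chat_stream`/`chat` are now called directly, and Cohere's own errors (`TooManyRequestsError`, `InternalServerError`, `ServiceUnavailableError`) propagate to the new `except` clause in `client.py`. A condensed sketch of the reworked branch structure, with the tool-call construction and token accounting elided:

```python
def cohere_respond(client, cohere_params: dict, streaming: bool):
    """Condensed sketch of the streaming / non-streaming paths after this PR."""
    cohere_finish = "stop"
    tool_calls = None
    if streaming:
        ans = ""
        for event in client.chat_stream(**cohere_params):
            if event.event_type == "text-generation":
                ans += event.text
            elif event.event_type == "tool-calls-generation":
                # When streaming, tool calls arrive compiled into one final event.
                ans = event.text
                cohere_finish = "tool_calls"
                tool_calls = list(event.tool_calls)  # turned into OAI tool calls in the real code
    else:
        response = client.chat(**cohere_params)
        ans = response.text
        if response.tool_calls is not None:
            cohere_finish = "tool_calls"
            tool_calls = list(response.tool_calls)
    return ans, cohere_finish, tool_calls
```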
26 changes: 2 additions & 24 deletions autogen/oai/gemini.py
@@ -51,7 +51,6 @@
import requests
import vertexai
from google.ai.generativelanguage import Content, Part
from google.api_core.exceptions import InternalServerError
from google.auth.credentials import Credentials
from openai.types.chat import ChatCompletion
from openai.types.chat.chat_completion import ChatCompletionMessage, Choice
@@ -222,30 +221,9 @@ def create(self, params: Dict) -> ChatCompletion:
)
genai.configure(api_key=self.api_key)
chat = model.start_chat(history=gemini_messages[:-1])
max_retries = 5
for attempt in range(max_retries):
ans = None
try:
response = chat.send_message(
gemini_messages[-1].parts, stream=stream, safety_settings=safety_settings
)
except InternalServerError:
delay = 5 * (2**attempt)
warnings.warn(
f"InternalServerError `500` occurs when calling Gemini's chat model. Retry in {delay} seconds...",
UserWarning,
)
time.sleep(delay)
except Exception as e:
raise RuntimeError(f"Google GenAI exception occurred while calling Gemini API: {e}")
else:
# `ans = response.text` is unstable. Use the following code instead.
ans: str = chat.history[-1].parts[0].text
break

if ans is None:
raise RuntimeError(f"Fail to get response from Google AI after retrying {attempt + 1} times.")

response = chat.send_message(gemini_messages[-1].parts, stream=stream, safety_settings=safety_settings)
ans: str = chat.history[-1].parts[0].text
prompt_tokens = model.count_tokens(chat.history[:-1]).total_tokens
completion_tokens = model.count_tokens(ans).total_tokens
elif model_name == "gemini-pro-vision":
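gemini.py likewise loses its private exponential-backoff loop and the now-unused `InternalServerError` import: `send_message` is called once, and `google.api_core` errors (`InternalServerError`, `ResourceExhausted`) reach the wrapper's shared `except` clause instead. A sketch of the simplified call, assuming `chat` is a started google.generativeai chat session and `gemini_messages` is the converted history:

```python
def gemini_send(chat, gemini_messages, stream, safety_settings):
    """Sketch of the single, unretried Gemini call after this PR."""
    # Errors such as InternalServerError / ResourceExhausted are no longer
    # retried here; they propagate to OpenAIWrapper.create's shared handler,
    # which logs them and falls back to the next configured client.
    response = chat.send_message(
        gemini_messages[-1].parts, stream=stream, safety_settings=safety_settings
    )
    # Read the reply from chat.history rather than response.text, which the
    # original code notes is unstable.
    ans = chat.history[-1].parts[0].text
    return response, ans
```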