diff --git a/README.md b/README.md index 2e3adbd46..d60a110ae 100644 --- a/README.md +++ b/README.md @@ -34,28 +34,35 @@ Run example in Collab: gpt4 text -> dalle3 img -> gpt4vision img + text analyze img -> dalle3 img -> loop + +""" +from swarms.models.gpt4_vision_api import GPT4VisionAPI diff --git a/playground/demos/swarm_of_mma_manufacturing/main.py b/playground/demos/swarm_of_mma_manufacturing/main.py new file mode 100644 index 000000000..ebb007682 --- /dev/null +++ b/playground/demos/swarm_of_mma_manufacturing/main.py @@ -0,0 +1,15 @@ +""" +Swarm of multi modal autonomous agents for manufacturing! +--------------------------------------------------------- +Health Security agent: Agent that monitors the health of working conditions: input image of factory output: health safety index 0.0 - 1.0 being the highest +Quality Control agent: Agent that monitors the quality of the product: input image of product output: quality index 0.0 - 1.0 being the highest +Productivity agent: Agent that monitors the productivity of the factory: input image of factory output: productivity index 0.0 - 1.0 being the highest +Safety agent: Agent that monitors the safety of the factory: input image of factory output: safety index 0.0 - 1.0 being the highest +Security agent: Agent that monitors the security of the factory: input image of factory output: security index 0.0 - 1.0 being the highest +Sustainability agent: Agent that monitors the sustainability of the factory: input image of factory output: sustainability index 0.0 - 1.0 being the highest +Efficiency agent: Agent that monitors the efficiency of the factory: input image of factory output: efficiency index 0.0 - 1.0 being the highest + + +Flow: +health security agent -> quality control agent -> productivity agent -> safety agent -> security agent -> sustainability agent -> efficiency agent +""" diff --git a/pyproject.toml b/pyproject.toml index dcad7e7e6..0b5a1930a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "swarms" -version = "2.4.1" +version = "2.4.2" description = "Swarms - Pytorch" license = "MIT" authors = ["Kye Gomez "] diff --git a/swarms/models/base_multimodal_model.py b/swarms/models/base_multimodal_model.py index 9f451be06..e5671917d 100644 --- a/swarms/models/base_multimodal_model.py +++ b/swarms/models/base_multimodal_model.py @@ -1,3 +1,4 @@ +from abc import abstractmethod import asyncio import base64 import concurrent.futures @@ -7,8 +8,8 @@ from typing import List, Optional, Tuple import requests -from ABC import abstractmethod from PIL import Image +from termcolor import colored class BaseMultiModalModel: @@ -37,7 +38,6 @@ def __init__( self.retries = retries self.chat_history = [] - @abstractmethod def __call__(self, text: str, img: str): """Run the model""" @@ -61,17 +61,17 @@ def get_img_from_web(self, img: str): except requests.RequestException as error: print(f"Error fetching image from {img} and error: {error}") return None - + def encode_img(self, img: str): """Encode the image to base64""" with open(img, "rb") as image_file: return base64.b64encode(image_file.read()).decode("utf-8") - + def get_img(self, img: str): """Get the image from the path""" image_pil = Image.open(img) return image_pil - + def clear_chat_history(self): """Clear the chat history""" self.chat_history = [] @@ -87,11 +87,11 @@ def run_many( Args: tasks (List[str]): List of tasks imgs (List[str]): List of image paths - + Returns: List[str]: List of responses - - + + """ # Instantiate the thread pool executor with ThreadPoolExecutor(max_workers=self.max_workers) as executor: @@ -101,7 +101,6 @@ def run_many( for result in results: print(result) - def run_batch(self, tasks_images: List[Tuple[str, str]]) -> List[str]: """Process a batch of tasks and images""" with concurrent.futures.ThreadPoolExecutor() as executor: @@ -133,11 +132,11 @@ async def run_batch_async_with_retries( for task, img in tasks_images ] return await asyncio.gather(*futures) - + def unique_chat_history(self): """Get the unique chat history""" return list(set(self.chat_history)) - + def run_with_retries(self, task: str, img: str): """Run the model with retries""" for i in range(self.retries): @@ -146,7 +145,7 @@ def run_with_retries(self, task: str, img: str): except Exception as error: print(f"Error with the request {error}") continue - + def run_batch_with_retries(self, tasks_images: List[Tuple[str, str]]): """Run the model with retries""" for i in range(self.retries): @@ -188,28 +187,37 @@ def get_generation_time(self) -> float: if self.start_time and self.end_time: return self.end_time - self.start_time return 0 - + def get_chat_history(self): """Get the chat history""" return self.chat_history - + def get_unique_chat_history(self): """Get the unique chat history""" return list(set(self.chat_history)) - + def get_chat_history_length(self): """Get the chat history length""" return len(self.chat_history) - + def get_unique_chat_history_length(self): """Get the unique chat history length""" return len(list(set(self.chat_history))) - + def get_chat_history_tokens(self): """Get the chat history tokens""" return self._num_tokens() - + def print_beautiful(self, content: str, color: str = "cyan"): """Print Beautifully with termcolor""" content = colored(content, color) - print(content) \ No newline at end of file + print(content) + + def stream(self, content: str): + """Stream the output + + Args: + content (str): _description_ + """ + for chunk in content: + print(chunk) diff --git a/swarms/models/gpt4_vision_api.py b/swarms/models/gpt4_vision_api.py index 6a8b8eb8c..869cde1aa 100644 --- a/swarms/models/gpt4_vision_api.py +++ b/swarms/models/gpt4_vision_api.py @@ -1,6 +1,7 @@ -import logging +import logging import asyncio import base64 +from typing import Optional import concurrent.futures from termcolor import colored import json @@ -12,6 +13,13 @@ import requests from dotenv import load_dotenv + +try: + import cv2 +except ImportError: + print("OpenCV not installed. Please install OpenCV to use this model.") + raise ImportError + # Load environment variables load_dotenv() openai_api_key = os.getenv("OPENAI_API_KEY") @@ -59,7 +67,8 @@ def __init__( max_workers: int = 10, max_tokens: str = 300, openai_proxy: str = "https://api.openai.com/v1/chat/completions", - beautify: bool = False + beautify: bool = False, + streaming_enabled: Optional[bool] = False, ): super().__init__() self.openai_api_key = openai_api_key @@ -69,6 +78,7 @@ def __init__( self.max_tokens = max_tokens self.openai_proxy = openai_proxy self.beautify = beautify + self.streaming_enabled = streaming_enabled if self.logging_enabled: logging.basicConfig(level=logging.DEBUG) @@ -123,14 +133,101 @@ def run(self, task: str, img: str): out = response.json() content = out["choices"][0]["message"]["content"] + if self.streaming_enabled: + content = self.stream_response(content) + else: + pass + if self.beautify: content = colored(content, "cyan") + print(content) else: print(content) + except Exception as error: print(f"Error with the request: {error}") raise error + def video_prompt(self, frames): + """ + SystemPrompt is a class that generates a prompt for the user to respond to. + The prompt is generated based on the current state of the system. + + Parameters + ---------- + frames : list + A list of base64 frames + + Returns + ------- + PROMPT : str + The system prompt + + Examples + -------- + + >>> from swarms.models import GPT4VisionAPI + >>> llm = GPT4VisionAPI() + >>> video = "video.mp4" + >>> base64_frames = llm.process_video(video) + >>> prompt = llm.video_prompt(base64_frames) + >>> print(prompt) + + """ + PROMPT = f""" + These are frames from a video that I want to upload. Generate a compelling description that I can upload along with the video: + + {frames} + """ + return PROMPT + + def stream_response(self, content: str): + """Stream the response of the output + + Args: + content (str): _description_ + """ + for chunk in content: + print(chunk) + + def process_video(self, video: str): + """ + Process a video into a list of base64 frames + + Parameters + ---------- + video : str + The path to the video file + + Returns + ------- + base64_frames : list + A list of base64 frames + + Examples + -------- + >>> from swarms.models import GPT4VisionAPI + >>> llm = GPT4VisionAPI() + >>> video = "video.mp4" + >>> base64_frames = llm.process_video(video) + + """ + video = cv2.VideoCapture(video) + + base64_frames = [] + while video.isOpened(): + success, frame = video.read() + if not success: + break + _, buffer = cv2.imencode(".jpg", frame) + base64_frames.append(base64.b64encode(buffer).decode("utf-8")) + + video.release() + print(len(base64_frames), "frames read.") + + for img in base64_frames: + base64.b64decode(img.encode("utf-8")) + def __call__(self, task: str, img: str): """Run the model.""" try: @@ -168,10 +265,17 @@ def __call__(self, task: str, img: str): out = response.json() content = out["choices"][0]["message"]["content"] + if self.streaming_enabled: + content = self.stream_response(content) + else: + pass + if self.beautify: content = colored(content, "cyan") + print(content) else: print(content) + except Exception as error: print(f"Error with the request: {error}") raise error diff --git a/swarms/models/kosmos_two.py b/swarms/models/kosmos_two.py index 7e9da590a..99998287f 100644 --- a/swarms/models/kosmos_two.py +++ b/swarms/models/kosmos_two.py @@ -24,7 +24,7 @@ class Kosmos: ---------- model_name : str Path to the pretrained model - + Examples -------- >>> kosmos = Kosmos() diff --git a/swarms/models/whisperx_model.py b/swarms/models/whisperx_model.py index 883c3edbc..338db6e3a 100644 --- a/swarms/models/whisperx_model.py +++ b/swarms/models/whisperx_model.py @@ -99,7 +99,9 @@ def transcribe_youtube_video(self): print("The key 'segments' is not found in the result.") def transcribe(self, audio_file): - model = whisperx_model.load_model("large-v2", self.device, self.compute_type) + model = whisperx_model.load_model( + "large-v2", self.device, self.compute_type + ) audio = whisperx_model.load_audio(audio_file) result = model.transcribe(audio, batch_size=self.batch_size) diff --git a/swarms/structs/flow.py b/swarms/structs/flow.py index 47740f73f..e00791992 100644 --- a/swarms/structs/flow.py +++ b/swarms/structs/flow.py @@ -498,7 +498,7 @@ def activate_autonomous_agent(self): ) print(error) - def run(self, task: str, img: Optional[str], **kwargs): + def run(self, task: Optional[str], img: Optional[str] = None, **kwargs): """ Run the autonomous agent loop @@ -528,7 +528,11 @@ def run(self, task: str, img: Optional[str], **kwargs): self.print_dashboard(task) loop_count = 0 + + # While the max_loops is auto or the loop count is less than the max_loops while self.max_loops == "auto" or loop_count < self.max_loops: + + # Loop count loop_count += 1 print( colored(f"\nLoop {loop_count} of {self.max_loops}", "blue") diff --git a/swarms/utils/disable_logging.py b/swarms/utils/disable_logging.py index 5b6ec6754..d1c7df9b8 100644 --- a/swarms/utils/disable_logging.py +++ b/swarms/utils/disable_logging.py @@ -1,15 +1,14 @@ - import logging import os import warnings + def disable_logging(): warnings.filterwarnings("ignore", category=UserWarning) # disable tensorflow warnings os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" - # Set the logging level for the entire module logging.basicConfig(level=logging.WARNING) @@ -20,6 +19,12 @@ def disable_logging(): except Exception as error: print(f"Pytorch logging not disabled: {error}") - for logger_name in ['tensorflow', 'h5py', 'numexpr', 'git', 'wandb.docker.auth']: + for logger_name in [ + "tensorflow", + "h5py", + "numexpr", + "git", + "wandb.docker.auth", + ]: logger = logging.getLogger(logger_name) - logger.setLevel(logging.WARNING) # Supress DEBUG and info logs + logger.setLevel(logging.WARNING) # Supress DEBUG and info logs