def _gpt_image_token_costs(model: str) -> tuple:
    """Return ``(base_token_count, token_multiplier)`` for the given model name.

    Matching is done by substring so that suffixed model names still resolve.
    ``gpt-4o-mini`` is tested before ``gpt-4o`` because the latter is a substring of
    the former.

    Raises:
        ValueError: If ``model`` matches none of the supported model families.
    """
    gpt4_vision_tags = ("gpt-4-vision", "gpt-4-turbo", "gpt-4v", "gpt-4-v")
    if any(tag in model for tag in gpt4_vision_tags):
        return 85, 170
    if "gpt-4o-mini" in model:
        return 2833, 5667
    if "gpt-4o" in model:
        return 85, 170
    raise ValueError(
        f"Model {model} is not supported. Choose 'gpt-4-vision', 'gpt-4-turbo', 'gpt-4v', 'gpt-4-v', 'gpt-4o', or 'gpt-4o-mini'."
    )


def _scale_edge(length: int, factor: float) -> int:
    """Shrink one edge by ``factor``, truncating to whole pixels.

    A non-empty edge is never allowed to collapse to zero pixels: with an extreme
    aspect ratio (e.g. 10000x1) plain truncation would produce a 0-pixel edge, hence
    zero tiles, and the image would be counted as if it had no tiles at all.
    """
    scaled = int(length * factor)
    return max(scaled, 1) if length > 0 else scaled


def num_tokens_from_gpt_image(
    image_data: Union[str, Image.Image], model: str = "gpt-4-vision", low_quality: bool = False
) -> int:
    """
    Calculate the number of tokens required to process an image for a GPT vision model.

    The image is scaled so that its longest edge is at most 2048 pixels and then so that
    its shortest edge is at most 768 pixels. The number of 512x512 tiles needed to cover
    the scaled image is counted, and the total is
    ``base_token_count + token_multiplier * tiles``, where both constants depend on the
    model family.

    Reference: https://openai.com/api/pricing/

    Args:
        image_data : Union[str, Image.Image]: The image data which can either be a base64
            encoded string, a URL, a file path, or a PIL Image object.
        model: str: The model being used for image processing. Matched by substring
            against "gpt-4-vision", "gpt-4-turbo", "gpt-4v", "gpt-4-v", "gpt-4o" and
            "gpt-4o-mini".
        low_quality: bool: If True, return only the model's base token count. The image
            size does not affect this value, so ``image_data`` is not loaded at all.

    Returns:
        int: The total number of tokens required for processing the image.

    Raises:
        ValueError: If ``model`` is not one of the supported model families. The model is
            checked before the image is loaded.

    Examples:
        >>> from PIL import Image
        >>> img = Image.new('RGB', (2500, 2500), color = 'red')
        >>> num_tokens_from_gpt_image(img, model="gpt-4-vision")
        765
    """
    # Validate the model first: it is cheap and avoids resolving the image for nothing.
    base_token_count, token_multiplier = _gpt_image_token_costs(model)

    # Low-detail images cost a flat amount, independent of their dimensions.
    if low_quality:
        return base_token_count

    # The scaling limits and tile size are currently the same for every supported model.
    max_edge, min_edge, tile_size = 2048, 768, 512

    image = get_pil_image(image_data)  # PIL Image
    width, height = image.size

    # 1. Constrain the longest edge
    longest = max(width, height)
    if longest > max_edge:
        factor = max_edge / longest
        width, height = _scale_edge(width, factor), _scale_edge(height, factor)

    # 2. Further constrain the shortest edge
    shortest = min(width, height)
    if shortest > min_edge:
        factor = min_edge / shortest
        width, height = _scale_edge(width, factor), _scale_edge(height, factor)

    # 3. Count how many tiles are needed to cover the image
    tiles_width = ceil(width / tile_size)
    tiles_height = ceil(height / tile_size)
    return base_token_count + token_multiplier * (tiles_width * tiles_height)
class ImageTokenCountTest(unittest.TestCase):
    """Checks ``num_tokens_from_gpt_image`` against hand-collected expected token counts."""

    def test_tokens(self):
        """Verify token counts per image size for the default model, gpt-4o and gpt-4o-mini."""
        # Note: Ground Truth manually fetched from https://openai.com/api/pricing/ in 2024/10/05
        # Each entry: (width, height) -> expected tokens for (default model, gpt-4o, gpt-4o-mini)
        expectations = [
            ((10, 10), (85 + 170, 255, 8500)),
            ((512, 1025), (85 + 170 * 1 * 3, 595, 19834)),
            ((10, 1025), (85 + 170 * 1 * 3, 595, 19834)),
            ((10000, 10000), (85 + 170 * 2 * 2, 765, 25501)),
            ((10000, 5000), (85 + 170 * 3 * 2, 1105, 36835)),
        ]

        # Images are created one at a time, in order, and kept for the low-quality checks below.
        pictures = {}
        for size, (default_tokens, gpt4o_tokens, mini_tokens) in expectations:
            picture = Image.new("RGB", size, color="red")
            pictures[size] = picture
            self.assertEqual(num_tokens_from_gpt_image(picture), default_tokens)
            self.assertEqual(num_tokens_from_gpt_image(picture, "gpt-4o"), gpt4o_tokens)
            self.assertEqual(num_tokens_from_gpt_image(picture, "gpt-4o-mini"), mini_tokens)

        # Handle low quality
        huge_square = pictures[(10000, 10000)]
        huge_wide = pictures[(10000, 5000)]
        self.assertEqual(num_tokens_from_gpt_image(huge_square, "gpt-4-vision", low_quality=True), 85)
        self.assertEqual(num_tokens_from_gpt_image(huge_wide, "gpt-4o", low_quality=True), 85)
        self.assertEqual(num_tokens_from_gpt_image(huge_wide, "gpt-4o-mini", low_quality=True), 2833)