Image tokens calculation: add for GPT-4v, 4o, 4o-mini (#57)

* gpt-4v image token count
* Image token count for gpt-4o and gpt-4o-mini
* Img token configs into a global dict

---------

Co-authored-by: HRUSHIKESH DOKALA <[email protected]>
Co-authored-by: Qingyun Wu <[email protected]>
autogenhub · Oct 20, 2024 · a81a6a7 · a81a6a7
1 parent 5615d51
commit a81a6a7
Show file tree

Hide file tree

Showing 4 changed files with 217 additions and 0 deletions.
diff --git a/autogen/agentchat/contrib/img_utils.py b/autogen/agentchat/contrib/img_utils.py
@@ -9,13 +9,33 @@
 import os
 import re
 from io import BytesIO
+from math import ceil
 from typing import Dict, List, Tuple, Union
 
 import requests
 from PIL import Image
 
 from autogen.agentchat import utils
 
+# Parameters for token counting for images for different models.
+# num_tokens_from_gpt_image() selects one entry per call and reads these keys:
+#   max_edge         - upper bound, in pixels, applied to the image's longest edge (first downscale)
+#   min_edge         - upper bound, in pixels, applied to the image's shortest edge (second downscale)
+#   tile_size        - edge length, in pixels, of the square tiles used to cover the scaled image
+#   base_token_count - flat token cost; this is the whole cost when low_quality=True
+#   token_multiplier - token cost added for every tile in high-detail mode
+MODEL_PARAMS = {
+    "gpt-4-vision": {
+        "max_edge": 2048,
+        "min_edge": 768,
+        "tile_size": 512,
+        "base_token_count": 85,
+        "token_multiplier": 170,
+    },
+    # Same geometry as the other models, but with much larger per-image and per-tile token counts.
+    "gpt-4o-mini": {
+        "max_edge": 2048,
+        "min_edge": 768,
+        "tile_size": 512,
+        "base_token_count": 2833,
+        "token_multiplier": 5667,
+    },
+    # Identical values to "gpt-4-vision".
+    "gpt-4o": {"max_edge": 2048, "min_edge": 768, "tile_size": 512, "base_token_count": 85, "token_multiplier": 170},
+}
+
 
 def get_pil_image(image_file: Union[str, Image.Image]) -> Image.Image:
     """
@@ -304,3 +324,67 @@ def message_formatter_pil_to_b64(messages: List[Dict]) -> List[Dict]:
         new_messages.append(message)
 
     return new_messages
+
+
+def num_tokens_from_gpt_image(
+    image_data: Union[str, Image.Image], model: str = "gpt-4-vision", low_quality: bool = False
+) -> int:
+    """
+    Calculate the number of tokens required to process an image with a GPT vision model.
+    Supports "gpt-4-vision" (also matched by "gpt-4-turbo", "gpt-4v" and "gpt-4-v"), "gpt-4o",
+    and "gpt-4o-mini". The per-model constants are read from `MODEL_PARAMS`.
+
+    In high-detail mode the image is scaled down (never up) so that its longest edge is at most
+    `max_edge` pixels, and then so that its shortest edge is at most `min_edge` pixels. The number
+    of `tile_size` x `tile_size` tiles needed to cover the scaled image is counted, and the result
+    is `base_token_count + token_multiplier * tiles`. Any image covers at least one tile per axis,
+    even if integer truncation during scaling collapses an edge to zero pixels.
+
+    In low-detail mode the result is the flat `base_token_count`; the image itself is not loaded
+    because its dimensions do not affect the cost.
+
+    Reference: https://openai.com/api/pricing/
+
+    Args:
+        image_data : Union[str, Image.Image]: The image data which can either be a base64
+           encoded string, a URL, a file path, or a PIL Image object.
+        model: str: The model being used for image processing. Matched by substring, so dated
+           variants such as "gpt-4o-2024-08-06" are accepted.
+        low_quality: bool: If True, return the fixed low-detail cost instead of the tile-based cost.
+
+    Returns:
+        int: The total number of tokens required for processing the image.
+
+    Raises:
+        ValueError: If `model` does not match any supported model family.
+
+    Examples:
+    --------
+    >>> from PIL import Image
+    >>> img = Image.new('RGB', (2500, 2500), color = 'red')
+    >>> num_tokens_from_gpt_image(img, model="gpt-4-vision")
+    765
+    >>> num_tokens_from_gpt_image(img, model="gpt-4o", low_quality=True)
+    85
+    """
+
+    # Determine model parameters first: this needs no image data, so an unsupported model is
+    # reported before any image loading is attempted.
+    # "gpt-4o-mini" must be tested before "gpt-4o" because the latter is a substring of the former.
+    if "gpt-4-vision" in model or "gpt-4-turbo" in model or "gpt-4v" in model or "gpt-4-v" in model:
+        params = MODEL_PARAMS["gpt-4-vision"]
+    elif "gpt-4o-mini" in model:
+        params = MODEL_PARAMS["gpt-4o-mini"]
+    elif "gpt-4o" in model:
+        params = MODEL_PARAMS["gpt-4o"]
+    else:
+        raise ValueError(
+            f"Model {model} is not supported. Choose 'gpt-4-vision', 'gpt-4-turbo', 'gpt-4v', 'gpt-4-v', 'gpt-4o', or 'gpt-4o-mini'."
+        )
+
+    # Low-detail images have a fixed cost, so skip loading/decoding the image altogether.
+    if low_quality:
+        return params["base_token_count"]
+
+    image = get_pil_image(image_data)  # PIL Image
+    width, height = image.size
+
+    # 1. Constrain the longest edge
+    if max(width, height) > params["max_edge"]:
+        scale_factor = params["max_edge"] / max(width, height)
+        width, height = int(width * scale_factor), int(height * scale_factor)
+
+    # 2. Further constrain the shortest edge
+    if min(width, height) > params["min_edge"]:
+        scale_factor = params["min_edge"] / min(width, height)
+        width, height = int(width * scale_factor), int(height * scale_factor)
+
+    # 3. Count how many tiles are needed to cover the image.
+    #    int() above can truncate a very thin edge to 0 pixels (e.g. 10000x1 -> 2048x0), which
+    #    would yield zero tiles; clamp so that every image occupies at least one tile per axis.
+    tiles_width = max(1, ceil(width / params["tile_size"]))
+    tiles_height = max(1, ceil(height / params["tile_size"]))
+    total_tokens = params["base_token_count"] + params["token_multiplier"] * (tiles_width * tiles_height)
+
+    return total_tokens
diff --git a/autogen/token_count_utils.py b/autogen/token_count_utils.py
@@ -11,7 +11,20 @@
 
 import tiktoken
 
+# Image token counting is optional. When img_utils cannot be imported, install a stub so the
+# rest of this module keeps working and images simply contribute 0 tokens.
+try:
+    from autogen.agentchat.contrib.img_utils import num_tokens_from_gpt_image
+except ImportError:
+    img_util_imported = False
+
+    def num_tokens_from_gpt_image(*args, **kwargs):
+        """Stand-in used when img_utils is unavailable; accepts any arguments and returns 0."""
+        return 0
+
+else:
+    img_util_imported = True
+
+
 logger = logging.getLogger(__name__)
+# Flag stored on the logger object: records whether the missing-dependency warning was already logged.
+logger.img_dependency_warned = False
 
 
 def get_max_token_limit(model: str = "gpt-3.5-turbo-0613") -> int:
@@ -113,6 +126,13 @@ def _num_token_from_messages(messages: Union[List, Dict], model="gpt-3.5-turbo-0
         "gpt-4-32k-0314",
         "gpt-4-0613",
         "gpt-4-32k-0613",
+        "gpt-4-turbo-preview",
+        "gpt-4-vision-preview",
+        "gpt-4o",
+        "gpt-4o-2024-05-13",
+        "gpt-4o-2024-08-06",
+        "gpt-4o-mini",
+        "gpt-4o-mini-2024-07-18",
     }:
         tokens_per_message = 3
         tokens_per_name = 1
@@ -145,6 +165,30 @@ def _num_token_from_messages(messages: Union[List, Dict], model="gpt-3.5-turbo-0
             if value is None:
                 continue
 
+            # handle content if images are in GPT-4-vision
+            if key == "content" and isinstance(value, list):
+                for part in value:
+                    if not isinstance(part, dict) or "type" not in part:
+                        continue
+                    if part["type"] == "text":
+                        num_tokens += len(encoding.encode(part["text"]))
+                    if "image_url" in part:
+                        assert "url" in part["image_url"]
+                        if not img_util_imported and not logger.img_dependency_warned:
+                            logger.warning(
+                                "img_utils or PIL not imported. Skipping image token count."
+                                "Please install autogen with [lmm] option.",
+                            )
+                            logger.img_dependency_warned = True
+                        is_low_quality = "detail" in part["image_url"] and part["image_url"]["detail"] == "low"
+                        try:
+                            num_tokens += num_tokens_from_gpt_image(
+                                image_data=part["image_url"]["url"], model=model, low_quality=is_low_quality
+                            )
+                        except ValueError as e:
+                            logger.warning(f"Error in num_tokens_from_gpt_image: {e}")
+                continue
+
             # function calls
             if not isinstance(value, str):
                 try:

diff --git a/test/agentchat/contrib/test_img_utils.py b/test/agentchat/contrib/test_img_utils.py
@@ -26,6 +26,7 @@
         gpt4v_formatter,
         llava_formatter,
         message_formatter_pil_to_b64,
+        num_tokens_from_gpt_image,
     )
 except ImportError:
     skip = True
@@ -296,5 +297,39 @@ def test_formatting(self):
         self.assertEqual(result, expected_output)
 
 
+class ImageTokenCountTest(unittest.TestCase):
+    """Checks num_tokens_from_gpt_image against hand-collected token counts."""
+
+    def test_tokens(self):
+        # Note: Ground Truth manually fetched from https://openai.com/api/pricing/ in 2024/10/05
+        # Each row: (width, height), expected tokens for the default model, "gpt-4o", "gpt-4o-mini".
+        expectations = [
+            ((10, 10), 85 + 170, 255, 8500),  # small image
+            ((512, 1025), 85 + 170 * 1 * 3, 595, 19834),  # medium image
+            ((10, 1025), 85 + 170 * 1 * 3, 595, 19834),  # tall image
+            ((10000, 10000), 85 + 170 * 2 * 2, 765, 25501),  # huge image
+            ((10000, 5000), 85 + 170 * 3 * 2, 1105, 36835),  # huge wide image
+        ]
+        for size, default_tokens, gpt4o_tokens, mini_tokens in expectations:
+            picture = Image.new("RGB", size, color="red")
+            self.assertEqual(num_tokens_from_gpt_image(picture), default_tokens)
+            self.assertEqual(num_tokens_from_gpt_image(picture, "gpt-4o"), gpt4o_tokens)
+            self.assertEqual(num_tokens_from_gpt_image(picture, "gpt-4o-mini"), mini_tokens)
+
+        # Handle low quality: the result is the model's base token count, whatever the image size.
+        huge_image = Image.new("RGB", (10000, 10000), color="red")
+        huge_wide_image = Image.new("RGB", (10000, 5000), color="red")
+        low_quality_cases = [
+            (huge_image, "gpt-4-vision", 85),
+            (huge_wide_image, "gpt-4o", 85),
+            (huge_wide_image, "gpt-4o-mini", 2833),
+        ]
+        for picture, model_name, flat_tokens in low_quality_cases:
+            self.assertEqual(num_tokens_from_gpt_image(picture, model_name, low_quality=True), flat_tokens)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/test_token_count.py b/test/test_token_count.py
@@ -8,6 +8,14 @@
 
 import pytest
 
+# Probe for the optional image utilities; the flag is used below to skip image tests when missing.
+try:
+    from autogen.agentchat.contrib.img_utils import num_tokens_from_gpt_image
+except ImportError:
+    img_util_imported = False
+else:
+    img_util_imported = True
+
+
 from autogen.token_count_utils import (
     count_token,
     get_max_token_limit,
@@ -61,6 +69,52 @@ def test_num_tokens_from_functions(input_functions, expected_count):
     assert num_tokens_from_functions(input_functions) == expected_count
 
 
+@pytest.mark.skipif(not img_util_imported, reason="img_utils not imported")
+def test_num_tokens_from_gpt_image():
+    """count_token should add the image token cost to the text token cost of a multimodal chat."""
+    # PNG image embedded as a base64 data URI, so no file or network access is needed.
+    base64_encoded_image = (
+        "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4"
+        "//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg=="
+    )
+
+    def build_messages(image_url_part):
+        # Same system/user conversation for every case; only the image_url payload varies.
+        return [
+            {
+                "role": "system",
+                "content": "you are a helpful assistant. af3758 *3 33(3)",
+            },
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "hello asdfjj qeweee"},
+                    {"type": "image_url", "image_url": image_url_part},
+                ],
+            },
+        ]
+
+    # The total number of tokens is text + image
+    # where text = 34, as shown in the previous test case
+    # the image token is: 85 + 170 = 255
+    default_detail_messages = build_messages({"url": base64_encoded_image})
+    tokens = count_token(default_detail_messages, model="gpt-4-vision-preview")
+    assert tokens == 34 + 255
+
+    # Test low quality
+    low_detail_messages = build_messages({"url": base64_encoded_image, "detail": "low"})
+    tokens = count_token(low_detail_messages, model="gpt-4o")
+    assert tokens == 34 + 85
+
+
 def test_count_token():
     messages = [
         {