Skip to content

Commit

Permalink
Image tokens calculation: add for GPT-4v, 4o, 4o-mini (#57)
Browse files Browse the repository at this point in the history
* gpt-4v image token count

* Image token count for gpt-4o and gpt-4o-mini

* Img token configs into a global dict

---------

Co-authored-by: HRUSHIKESH DOKALA <[email protected]>
Co-authored-by: Qingyun Wu <[email protected]>
  • Loading branch information
3 people authored Oct 20, 2024
1 parent 5615d51 commit a81a6a7
Show file tree
Hide file tree
Showing 4 changed files with 217 additions and 0 deletions.
84 changes: 84 additions & 0 deletions autogen/agentchat/contrib/img_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,33 @@
import os
import re
from io import BytesIO
from math import ceil
from typing import Dict, List, Tuple, Union

import requests
from PIL import Image

from autogen.agentchat import utils

# Image token-counting parameters, keyed by model family.
# Every family shares the same resize limits ("max_edge", "min_edge") and "tile_size";
# they differ only in the flat per-image cost ("base_token_count") and the cost added
# for each tile ("token_multiplier").
MODEL_PARAMS = {
    family: {
        "max_edge": 2048,
        "min_edge": 768,
        "tile_size": 512,
        "base_token_count": base_cost,
        "token_multiplier": tile_cost,
    }
    for family, base_cost, tile_cost in (
        ("gpt-4-vision", 85, 170),
        ("gpt-4o-mini", 2833, 5667),
        ("gpt-4o", 85, 170),
    )
}


def get_pil_image(image_file: Union[str, Image.Image]) -> Image.Image:
"""
Expand Down Expand Up @@ -304,3 +324,67 @@ def message_formatter_pil_to_b64(messages: List[Dict]) -> List[Dict]:
new_messages.append(message)

return new_messages


def num_tokens_from_gpt_image(
    image_data: Union[str, Image.Image], model: str = "gpt-4-vision", low_quality: bool = False
) -> int:
    """
    Calculate the number of tokens required to process an image based on its dimensions
    after scaling for different GPT models. Supports "gpt-4-vision", "gpt-4o", and "gpt-4o-mini".

    The image is first scaled down (never up) so that its longest edge is at most ``max_edge``
    pixels, then scaled down again so that its shortest edge is at most ``min_edge`` pixels.
    The result is covered with ``tile_size`` x ``tile_size`` tiles and the total is
    ``base_token_count + token_multiplier * number_of_tiles``. All of these values are read
    from ``MODEL_PARAMS`` for the matched model family.

    Reference: https://openai.com/api/pricing/

    Args:
        image_data : Union[str, Image.Image]: The image data which can either be a base64
            encoded string, a URL, a file path, or a PIL Image object.
        model: str: The model being used for image processing. It is matched by substring,
            so dated variants such as "gpt-4o-2024-08-06" are accepted. Can be "gpt-4-vision"
            (also "gpt-4-turbo", "gpt-4v", "gpt-4-v"), "gpt-4o", or "gpt-4o-mini".
        low_quality: bool: When True, the image is billed at the flat base token count of the
            model and its dimensions are ignored.

    Returns:
        int: The total number of tokens required for processing the image.

    Raises:
        ValueError: If ``model`` does not match any supported model family.

    Examples:
    --------
        >>> from PIL import Image
        >>> img = Image.new('RGB', (2500, 2500), color = 'red')
        >>> num_tokens_from_gpt_image(img, model="gpt-4-vision")
        765
    """

    image = get_pil_image(image_data)  # PIL Image
    width, height = image.size

    # Determine model parameters. Matching is by substring, so "gpt-4o-mini" has to be
    # tested before "gpt-4o" (the latter is a prefix of the former).
    if any(alias in model for alias in ("gpt-4-vision", "gpt-4-turbo", "gpt-4v", "gpt-4-v")):
        params = MODEL_PARAMS["gpt-4-vision"]
    elif "gpt-4o-mini" in model:
        params = MODEL_PARAMS["gpt-4o-mini"]
    elif "gpt-4o" in model:
        params = MODEL_PARAMS["gpt-4o"]
    else:
        raise ValueError(
            f"Model {model} is not supported. Choose 'gpt-4-vision', 'gpt-4-turbo', 'gpt-4v', 'gpt-4-v', 'gpt-4o', or 'gpt-4o-mini'."
        )

    if low_quality:
        return params["base_token_count"]

    # 1. Constrain the longest edge
    longest_edge = max(width, height)
    if longest_edge > params["max_edge"]:
        scale_factor = params["max_edge"] / longest_edge
        # int() truncates, so the short edge of a very thin image (e.g. 10000x1) would
        # collapse to 0 and the image would be counted as zero tiles. Keep at least 1 pixel.
        width = max(1, int(width * scale_factor))
        height = max(1, int(height * scale_factor))

    # 2. Further constrain the shortest edge
    shortest_edge = min(width, height)
    if shortest_edge > params["min_edge"]:
        scale_factor = params["min_edge"] / shortest_edge
        width, height = int(width * scale_factor), int(height * scale_factor)

    # 3. Count how many tiles are needed to cover the image
    tiles_width = ceil(width / params["tile_size"])
    tiles_height = ceil(height / params["tile_size"])
    total_tokens = params["base_token_count"] + params["token_multiplier"] * (tiles_width * tiles_height)

    return total_tokens
44 changes: 44 additions & 0 deletions autogen/token_count_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,20 @@

import tiktoken

# The image helpers are optional (the warning emitted later in this module points users at
# the [lmm] install option). When they cannot be imported, fall back to a stub that reports
# zero tokens for every image so that text token counting keeps working.
try:
    from autogen.agentchat.contrib.img_utils import num_tokens_from_gpt_image
except ImportError:
    img_util_imported = False

    def num_tokens_from_gpt_image(*args, **kwargs):
        return 0

else:
    img_util_imported = True


logger = logging.getLogger(__name__)
logger.img_dependency_warned = False  # flag stored on the logger: set once the missing-dependency warning is logged


def get_max_token_limit(model: str = "gpt-3.5-turbo-0613") -> int:
Expand Down Expand Up @@ -113,6 +126,13 @@ def _num_token_from_messages(messages: Union[List, Dict], model="gpt-3.5-turbo-0
"gpt-4-32k-0314",
"gpt-4-0613",
"gpt-4-32k-0613",
"gpt-4-turbo-preview",
"gpt-4-vision-preview",
"gpt-4o",
"gpt-4o-2024-05-13",
"gpt-4o-2024-08-06",
"gpt-4o-mini",
"gpt-4o-mini-2024-07-18",
}:
tokens_per_message = 3
tokens_per_name = 1
Expand Down Expand Up @@ -145,6 +165,30 @@ def _num_token_from_messages(messages: Union[List, Dict], model="gpt-3.5-turbo-0
if value is None:
continue

# handle content if images are in GPT-4-vision
if key == "content" and isinstance(value, list):
for part in value:
if not isinstance(part, dict) or "type" not in part:
continue
if part["type"] == "text":
num_tokens += len(encoding.encode(part["text"]))
if "image_url" in part:
assert "url" in part["image_url"]
if not img_util_imported and not logger.img_dependency_warned:
logger.warning(
"img_utils or PIL not imported. Skipping image token count."
"Please install autogen with [lmm] option.",
)
logger.img_dependency_warned = True
is_low_quality = "detail" in part["image_url"] and part["image_url"]["detail"] == "low"
try:
num_tokens += num_tokens_from_gpt_image(
image_data=part["image_url"]["url"], model=model, low_quality=is_low_quality
)
except ValueError as e:
logger.warning(f"Error in num_tokens_from_gpt_image: {e}")
continue

# function calls
if not isinstance(value, str):
try:
Expand Down
35 changes: 35 additions & 0 deletions test/agentchat/contrib/test_img_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
gpt4v_formatter,
llava_formatter,
message_formatter_pil_to_b64,
num_tokens_from_gpt_image,
)
except ImportError:
skip = True
Expand Down Expand Up @@ -296,5 +297,39 @@ def test_formatting(self):
self.assertEqual(result, expected_output)


class ImageTokenCountTest(unittest.TestCase):
    """Checks num_tokens_from_gpt_image for the default model, gpt-4o and gpt-4o-mini."""

    def test_tokens(self):
        """Compare computed token counts with manually collected reference values."""
        # Note: Ground Truth manually fetched from https://openai.com/api/pricing/ in 2024/10/05
        # Each row: (width, height), expected tokens for the default model, gpt-4o, gpt-4o-mini.
        expectations = [
            ((10, 10), 85 + 170, 255, 8500),
            ((512, 1025), 85 + 170 * 1 * 3, 595, 19834),
            ((10, 1025), 85 + 170 * 1 * 3, 595, 19834),
            ((10000, 10000), 85 + 170 * 2 * 2, 765, 25501),
            ((10000, 5000), 85 + 170 * 3 * 2, 1105, 36835),
        ]

        images_by_size = {}
        for size, want_default, want_4o, want_4o_mini in expectations:
            picture = Image.new("RGB", size, color="red")
            images_by_size[size] = picture
            self.assertEqual(num_tokens_from_gpt_image(picture), want_default)
            self.assertEqual(num_tokens_from_gpt_image(picture, "gpt-4o"), want_4o)
            self.assertEqual(num_tokens_from_gpt_image(picture, "gpt-4o-mini"), want_4o_mini)

        # Handle low quality
        huge_image = images_by_size[(10000, 10000)]
        huge_wide_image = images_by_size[(10000, 5000)]
        self.assertEqual(num_tokens_from_gpt_image(huge_image, "gpt-4-vision", low_quality=True), 85)
        self.assertEqual(num_tokens_from_gpt_image(huge_wide_image, "gpt-4o", low_quality=True), 85)
        self.assertEqual(num_tokens_from_gpt_image(huge_wide_image, "gpt-4o-mini", low_quality=True), 2833)


# Run the unittest-based tests when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
54 changes: 54 additions & 0 deletions test/test_token_count.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,14 @@

import pytest

# Probe for the optional image utilities; the flag drives the skipif marker on the
# image token test below.
try:
    from autogen.agentchat.contrib.img_utils import num_tokens_from_gpt_image
except ImportError:
    img_util_imported = False
else:
    img_util_imported = True


from autogen.token_count_utils import (
count_token,
get_max_token_limit,
Expand Down Expand Up @@ -61,6 +69,52 @@ def test_num_tokens_from_functions(input_functions, expected_count):
assert num_tokens_from_functions(input_functions) == expected_count


@pytest.mark.skipif(not img_util_imported, reason="img_utils not imported")
def test_num_tokens_from_gpt_image():
    """count_token adds the image token cost to the text tokens of a multimodal message."""
    # Tiny inline PNG passed as a base64 data URI, so the test needs no file or network access.
    base64_encoded_image = (
        "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4"
        "//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg=="
    )

    def build_messages(image_url):
        # Same conversation for both scenarios; only the image_url payload differs.
        return [
            {
                "role": "system",
                "content": "you are a helpful assistant. af3758 *3 33(3)",
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "hello asdfjj qeweee"},
                    {"type": "image_url", "image_url": image_url},
                ],
            },
        ]

    messages = build_messages({"url": base64_encoded_image})
    tokens = count_token(messages, model="gpt-4-vision-preview")

    # The total number of tokens is text + image
    # where text = 34 (value carried over from the original test)
    # the image token is: 85 + 170 = 255 (base cost plus a single tile)
    assert tokens == 34 + 255

    # Test low quality: the image is billed at the flat base cost only
    messages = build_messages({"url": base64_encoded_image, "detail": "low"})
    tokens = count_token(messages, model="gpt-4o")
    assert tokens == 34 + 85


def test_count_token():
messages = [
{
Expand Down

0 comments on commit a81a6a7

Please sign in to comment.