Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Image tokens calculation: add for GPT-4v, 4o, 4o-mini #57

Merged
merged 7 commits into from
Oct 20, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions autogen/agentchat/contrib/img_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import os
import re
from io import BytesIO
from math import ceil
from typing import Dict, List, Tuple, Union

import requests
Expand Down Expand Up @@ -304,3 +305,70 @@ def message_formatter_pil_to_b64(messages: List[Dict]) -> List[Dict]:
new_messages.append(message)

return new_messages


def num_tokens_from_gpt_image(
    image_data: Union[str, Image.Image], model: str = "gpt-4-vision", low_quality: bool = False
) -> int:
    """
    Calculate the number of tokens required to process an image based on its dimensions
    after scaling for different GPT models. Supports "gpt-4-vision", "gpt-4o", and "gpt-4o-mini".
    This function scales the image so that its longest edge is at most 2048 pixels and its shortest
    edge is at most 768 pixels. It then calculates the number of 512x512 tiles
    needed to cover the scaled image and computes the total tokens based on the number of these tiles.

    Reference: https://openai.com/api/pricing/

    Args:
        image_data : Union[str, Image.Image]: The image data which can either be a base64
            encoded string, a URL, a file path, or a PIL Image object.
        model: str: The model being used for image processing. Can be "gpt-4-vision", "gpt-4o", or "gpt-4o-mini".
        low_quality: bool: If True, treat the image as sent with "detail": "low", which costs
            a flat per-model base fee regardless of the image dimensions.

    Returns:
        int: The total number of tokens required for processing the image.

    Raises:
        ValueError: If `model` does not match any supported model family.

    Examples:
    --------
    >>> from PIL import Image
    >>> img = Image.new('RGB', (2500, 2500), color = 'red')
    >>> num_tokens_from_gpt_image(img, model="gpt-4-vision")
    765
    """

    image = get_pil_image(image_data)  # normalize base64/URL/path/PIL input to a PIL Image
    width, height = image.size

    # Per-model pricing: flat base cost plus a multiplier per 512x512 tile.
    # NOTE: "gpt-4o-mini" must be matched before "gpt-4o" (substring containment).
    if "gpt-4-vision" in model or "gpt-4-turbo" in model or "gpt-4v" in model or "gpt-4-v" in model:
        max_edge, min_edge, tile_size = 2048, 768, 512
        base_token_count, token_multiplier = 85, 170
    elif "gpt-4o-mini" in model:
        max_edge, min_edge, tile_size = 2048, 768, 512
        base_token_count, token_multiplier = 2833, 5667
    elif "gpt-4o" in model:
        max_edge, min_edge, tile_size = 2048, 768, 512
        base_token_count, token_multiplier = 85, 170
    else:
        raise ValueError(
            f"Model {model} is not supported. Choose 'gpt-4-vision', 'gpt-4-turbo', 'gpt-4v', 'gpt-4-v', 'gpt-4o', or 'gpt-4o-mini'."
        )

    # "detail": "low" images cost a flat fee, independent of resolution.
    if low_quality:
        return base_token_count

    # 1. Constrain the longest edge to at most max_edge pixels
    if max(width, height) > max_edge:
        scale_factor = max_edge / max(width, height)
        width, height = int(width * scale_factor), int(height * scale_factor)

    # 2. Further constrain the shortest edge to at most min_edge pixels
    if min(width, height) > min_edge:
        scale_factor = min_edge / min(width, height)
        width, height = int(width * scale_factor), int(height * scale_factor)

    # 3. Count how many tile_size x tile_size tiles are needed to cover the scaled image
    tiles_width = ceil(width / tile_size)
    tiles_height = ceil(height / tile_size)
    total_tokens = base_token_count + token_multiplier * (tiles_width * tiles_height)

    return total_tokens
44 changes: 44 additions & 0 deletions autogen/token_count_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,20 @@

import tiktoken

# Image token counting needs img_utils (and thus PIL, installed with the [lmm] extra).
# Fall back to a zero-count stub so text-only token counting works without it.
try:
    from autogen.agentchat.contrib.img_utils import num_tokens_from_gpt_image

    img_util_imported = True
except ImportError:

    # Stub used when img_utils/PIL are unavailable: every image counts as 0 tokens.
    def num_tokens_from_gpt_image(*args, **kwargs):
        return 0

    img_util_imported = False


logger = logging.getLogger(__name__)
logger.img_dependency_warned = False  # member variable to track if the warning has been logged


def get_max_token_limit(model: str = "gpt-3.5-turbo-0613") -> int:
Expand Down Expand Up @@ -113,6 +126,13 @@ def _num_token_from_messages(messages: Union[List, Dict], model="gpt-3.5-turbo-0
"gpt-4-32k-0314",
"gpt-4-0613",
"gpt-4-32k-0613",
"gpt-4-turbo-preview",
"gpt-4-vision-preview",
"gpt-4o",
"gpt-4o-2024-05-13",
"gpt-4o-2024-08-06",
"gpt-4o-mini",
"gpt-4o-mini-2024-07-18",
}:
tokens_per_message = 3
tokens_per_name = 1
Expand Down Expand Up @@ -145,6 +165,30 @@ def _num_token_from_messages(messages: Union[List, Dict], model="gpt-3.5-turbo-0
if value is None:
continue

# handle content if images are in GPT-4-vision
if key == "content" and isinstance(value, list):
for part in value:
if not isinstance(part, dict) or "type" not in part:
continue
if part["type"] == "text":
num_tokens += len(encoding.encode(part["text"]))
if "image_url" in part:
assert "url" in part["image_url"]
if not img_util_imported and not logger.img_dependency_warned:
logger.warning(
"img_utils or PIL not imported. Skipping image token count."
"Please install autogen with [lmm] option.",
)
logger.img_dependency_warned = True
is_low_quality = "detail" in part["image_url"] and part["image_url"]["detail"] == "low"
try:
num_tokens += num_tokens_from_gpt_image(
image_data=part["image_url"]["url"], model=model, low_quality=is_low_quality
)
except ValueError as e:
logger.warning(f"Error in num_tokens_from_gpt_image: {e}")
continue

# function calls
if not isinstance(value, str):
try:
Expand Down
35 changes: 35 additions & 0 deletions test/agentchat/contrib/test_img_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
gpt4v_formatter,
llava_formatter,
message_formatter_pil_to_b64,
num_tokens_from_gpt_image,
)
except ImportError:
skip = True
Expand Down Expand Up @@ -296,5 +297,39 @@ def test_formatting(self):
self.assertEqual(result, expected_output)


class ImageTokenCountTest(unittest.TestCase):
    def test_tokens(self):
        """Compare computed image token counts against ground truth fetched
        manually from https://openai.com/api/pricing/ on 2024/10/05."""

        def check(size, vision_expected, gpt4o_expected, mini_expected):
            # Build a solid-red test image and verify all three model families.
            img = Image.new("RGB", size, color="red")
            self.assertEqual(num_tokens_from_gpt_image(img), vision_expected)
            self.assertEqual(num_tokens_from_gpt_image(img, "gpt-4o"), gpt4o_expected)
            self.assertEqual(num_tokens_from_gpt_image(img, "gpt-4o-mini"), mini_expected)
            return img

        check((10, 10), 85 + 170, 255, 8500)  # tiny image -> single tile
        check((512, 1025), 85 + 170 * 1 * 3, 595, 19834)  # 1x3 tiles after scaling
        check((10, 1025), 85 + 170 * 1 * 3, 595, 19834)  # tall narrow image
        huge_image = check((10000, 10000), 85 + 170 * 2 * 2, 765, 25501)  # scaled to 2x2 tiles
        huge_wide_image = check((10000, 5000), 85 + 170 * 3 * 2, 1105, 36835)  # scaled to 3x2 tiles

        # Handle low quality: flat per-model base fee regardless of size
        self.assertEqual(num_tokens_from_gpt_image(huge_image, "gpt-4-vision", low_quality=True), 85)
        self.assertEqual(num_tokens_from_gpt_image(huge_wide_image, "gpt-4o", low_quality=True), 85)
        self.assertEqual(num_tokens_from_gpt_image(huge_wide_image, "gpt-4o-mini", low_quality=True), 2833)


# Allow running this test module directly (outside a pytest/unittest runner).
if __name__ == "__main__":
    unittest.main()
54 changes: 54 additions & 0 deletions test/test_token_count.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,14 @@

import pytest

# img_utils requires PIL (the [lmm] extra); record availability so image-token
# tests can be skipped cleanly when the dependency is missing.
try:
    from autogen.agentchat.contrib.img_utils import num_tokens_from_gpt_image

    img_util_imported = True
except ImportError:
    img_util_imported = False


from autogen.token_count_utils import (
count_token,
get_max_token_limit,
Expand Down Expand Up @@ -61,6 +69,52 @@ def test_num_tokens_from_functions(input_functions, expected_count):
assert num_tokens_from_functions(input_functions) == expected_count


@pytest.mark.skipif(not img_util_imported, reason="img_utils not imported")
def test_num_tokens_from_gpt_image():
    """count_token should add per-image token costs on top of text token costs."""
    # A tiny PNG encoded as a base64 data URI.
    base64_encoded_image = (
        "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4"
        "//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg=="
    )

    def build_messages(image_url_part):
        # System text plus a user turn containing text and one image part.
        return [
            {
                "role": "system",
                "content": "you are a helpful assistant. af3758 *3 33(3)",
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "hello asdfjj qeweee"},
                    {"type": "image_url", "image_url": image_url_part},
                ],
            },
        ]

    # Default detail: total = text tokens + image tokens
    # where text = 34, as shown in the previous test case
    # and the image token count is: 85 + 170 = 255
    tokens = count_token(build_messages({"url": base64_encoded_image}), model="gpt-4-vision-preview")
    assert tokens == 34 + 255

    # Test low quality: image costs only the flat base fee (85)
    tokens = count_token(
        build_messages({"url": base64_encoded_image, "detail": "low"}), model="gpt-4o"
    )
    assert tokens == 34 + 85


def test_count_token():
messages = [
{
Expand Down
Loading