Skip to content

Commit

Permalink
Image token count for gpt-4o and gpt-4o-mini
Browse files Browse the repository at this point in the history
  • Loading branch information
BabyCNM committed Oct 5, 2024
1 parent bfae7d4 commit ddcb8b4
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 24 deletions.
65 changes: 44 additions & 21 deletions autogen/agentchat/contrib/img_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,45 +307,68 @@ def message_formatter_pil_to_b64(messages: List[Dict]) -> List[Dict]:
return new_messages


def num_tokens_from_gpt_image(
    image_data: Union[str, Image.Image], model: str = "gpt-4-vision", low_quality: bool = False
) -> int:
    """
    Calculate the number of tokens required to process an image based on its dimensions
    after scaling for different GPT models. Supports "gpt-4-vision", "gpt-4o", and "gpt-4o-mini".

    This function scales the image so that its longest edge is at most 2048 pixels and its shortest
    edge is at most 768 pixels. It then calculates the number of 512x512 tiles
    needed to cover the scaled image and computes the total tokens based on the number of these tiles.

    Reference: https://openai.com/api/pricing/

    Args:
        image_data : Union[str, Image.Image]: The image data which can either be a base64
            encoded string, a URL, a file path, or a PIL Image object.
        model: str: The model being used for image processing. Can be "gpt-4-vision", "gpt-4o", or "gpt-4o-mini".
        low_quality: bool: If True, return the flat low-detail token cost for the model
            without inspecting the image dimensions.

    Returns:
        int: The total number of tokens required for processing the image.

    Raises:
        ValueError: If the model name matches none of the supported families.

    Examples:
    --------
    >>> from PIL import Image
    >>> img = Image.new('RGB', (2500, 2500), color = 'red')
    >>> num_tokens_from_gpt_image(img, model="gpt-4-vision")
    765
    """

    image = get_pil_image(image_data)  # PIL Image
    width, height = image.size

    # Scaling factors and tile sizes may differ depending on the model.
    # NOTE: "gpt-4o-mini" must be matched before "gpt-4o" — every "gpt-4o-mini"
    # string also contains the substring "gpt-4o".
    if "gpt-4-vision" in model or "gpt-4-turbo" in model or "gpt-4v" in model or "gpt-4-v" in model:
        max_edge, min_edge, tile_size = 2048, 768, 512
        base_token_count, token_multiplier = 85, 170
    elif "gpt-4o-mini" in model:
        max_edge, min_edge, tile_size = 2048, 768, 512
        base_token_count, token_multiplier = 2833, 5667
    elif "gpt-4o" in model:
        max_edge, min_edge, tile_size = 2048, 768, 512
        base_token_count, token_multiplier = 85, 170
    else:
        raise ValueError(
            f"Model {model} is not supported. Choose 'gpt-4-vision', 'gpt-4-turbo', 'gpt-4v', 'gpt-4-v', 'gpt-4o', or 'gpt-4o-mini'."
        )

    # Low-detail mode is a flat cost regardless of image size.
    if low_quality:
        return base_token_count

    # 1. Constrain the longest edge
    if max(width, height) > max_edge:
        scale_factor = max_edge / max(width, height)
        width, height = int(width * scale_factor), int(height * scale_factor)

    # 2. Further constrain the shortest edge
    if min(width, height) > min_edge:
        scale_factor = min_edge / min(width, height)
        width, height = int(width * scale_factor), int(height * scale_factor)

    # 3. Count how many tiles are needed to cover the image
    tiles_width = ceil(width / tile_size)
    tiles_height = ceil(height / tile_size)
    total_tokens = base_token_count + token_multiplier * (tiles_width * tiles_height)

    return total_tokens
21 changes: 18 additions & 3 deletions autogen/token_count_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,14 @@
img_util_imported = True
except ImportError:

def num_tokens_from_gpt_image(*args, **kwargs):
    # Fallback stub used when PIL/img_utils cannot be imported: accept any call
    # signature the real implementation supports and count images as 0 tokens.
    return 0

img_util_imported = False


logger = logging.getLogger(__name__)
logger.img_dependency_warned = False # member variable to track if the warning has been logged


def get_max_token_limit(model: str = "gpt-3.5-turbo-0613") -> int:
Expand Down Expand Up @@ -125,6 +126,13 @@ def _num_token_from_messages(messages: Union[List, Dict], model="gpt-3.5-turbo-0
"gpt-4-32k-0314",
"gpt-4-0613",
"gpt-4-32k-0613",
"gpt-4-turbo-preview",
"gpt-4-vision-preview",
"gpt-4o",
"gpt-4o-2024-05-13",
"gpt-4o-2024-08-06",
"gpt-4o-mini",
"gpt-4o-mini-2024-07-18",
}:
tokens_per_message = 3
tokens_per_name = 1
Expand Down Expand Up @@ -166,12 +174,19 @@ def _num_token_from_messages(messages: Union[List, Dict], model="gpt-3.5-turbo-0
num_tokens += len(encoding.encode(part["text"]))
if "image_url" in part:
assert "url" in part["image_url"]
if not img_util_imported:
if not img_util_imported and not logger.img_dependency_warned:
logger.warning(
"img_utils or PIL not imported. Skipping image token count."
"Please install autogen with [lmm] option.",
)
num_tokens += num_tokens_from_gpt_image(part["image_url"]["url"])
logger.img_dependency_warned = True
is_low_quality = "detail" in part["image_url"] and part["image_url"]["detail"] == "low"
try:
num_tokens += num_tokens_from_gpt_image(
image_data=part["image_url"]["url"], model=model, low_quality=is_low_quality
)
except ValueError as e:
logger.warning(f"Error in num_tokens_from_gpt_image: {e}")
continue

# function calls
Expand Down
16 changes: 16 additions & 0 deletions test/agentchat/contrib/test_img_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,20 +299,36 @@ def test_formatting(self):

class ImageTokenCountTest(unittest.TestCase):
    """Checks num_tokens_from_gpt_image against hand-computed token counts per model."""

    def test_tokens(self):
        # Note: ground truth manually fetched from https://openai.com/api/pricing/ on 2024/10/05
        small_image = Image.new("RGB", (10, 10), color="red")
        self.assertEqual(num_tokens_from_gpt_image(small_image), 85 + 170)
        self.assertEqual(num_tokens_from_gpt_image(small_image, "gpt-4o"), 255)
        self.assertEqual(num_tokens_from_gpt_image(small_image, "gpt-4o-mini"), 8500)

        med_image = Image.new("RGB", (512, 1025), color="red")
        self.assertEqual(num_tokens_from_gpt_image(med_image), 85 + 170 * 1 * 3)
        self.assertEqual(num_tokens_from_gpt_image(med_image, "gpt-4o"), 595)
        self.assertEqual(num_tokens_from_gpt_image(med_image, "gpt-4o-mini"), 19834)

        tall_image = Image.new("RGB", (10, 1025), color="red")
        self.assertEqual(num_tokens_from_gpt_image(tall_image), 85 + 170 * 1 * 3)
        self.assertEqual(num_tokens_from_gpt_image(tall_image, "gpt-4o"), 595)
        self.assertEqual(num_tokens_from_gpt_image(tall_image, "gpt-4o-mini"), 19834)

        huge_image = Image.new("RGB", (10000, 10000), color="red")
        self.assertEqual(num_tokens_from_gpt_image(huge_image), 85 + 170 * 2 * 2)
        self.assertEqual(num_tokens_from_gpt_image(huge_image, "gpt-4o"), 765)
        self.assertEqual(num_tokens_from_gpt_image(huge_image, "gpt-4o-mini"), 25501)

        huge_wide_image = Image.new("RGB", (10000, 5000), color="red")
        self.assertEqual(num_tokens_from_gpt_image(huge_wide_image), 85 + 170 * 3 * 2)
        self.assertEqual(num_tokens_from_gpt_image(huge_wide_image, "gpt-4o"), 1105)
        self.assertEqual(num_tokens_from_gpt_image(huge_wide_image, "gpt-4o-mini"), 36835)

        # Handle low quality: flat per-model cost, independent of image size
        self.assertEqual(num_tokens_from_gpt_image(huge_image, "gpt-4-vision", low_quality=True), 85)
        self.assertEqual(num_tokens_from_gpt_image(huge_wide_image, "gpt-4o", low_quality=True), 85)
        self.assertEqual(num_tokens_from_gpt_image(huge_wide_image, "gpt-4o-mini", low_quality=True), 2833)


if __name__ == "__main__":
Expand Down
17 changes: 17 additions & 0 deletions test/test_token_count.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,23 @@ def test_num_tokens_from_gpt_image():
# the image token is: 85 + 170 = 255
assert tokens == 34 + 255

# Test low quality
messages = [
{
"role": "system",
"content": "you are a helpful assistant. af3758 *3 33(3)",
},
{
"role": "user",
"content": [
{"type": "text", "text": "hello asdfjj qeweee"},
{"type": "image_url", "image_url": {"url": base64_encoded_image, "detail": "low"}},
],
},
]
tokens = count_token(messages, model="gpt-4o")
assert tokens == 34 + 85


def test_count_token():
messages = [
Expand Down

0 comments on commit ddcb8b4

Please sign in to comment.