From 2a888dbd474bc3d6d862bdffd03a7026ab0e5eab Mon Sep 17 00:00:00 2001
From: turboderp <11859846+turboderp@users.noreply.github.com>
Date: Tue, 12 Nov 2024 03:46:29 +0100
Subject: [PATCH] Pixtral example

---
 examples/multimodal_pixtral.py               | 142 ++++++++++++++++++++
 {experimental => examples}/test_image_1.jpg  | Bin
 {experimental => examples}/test_image_2.jpg  | Bin
 experimental/multimodal_pixtral_hf.py        |  82 -----------
 4 files changed, 142 insertions(+), 82 deletions(-)
 create mode 100644 examples/multimodal_pixtral.py
 rename {experimental => examples}/test_image_1.jpg (100%)
 rename {experimental => examples}/test_image_2.jpg (100%)
 delete mode 100644 experimental/multimodal_pixtral_hf.py

diff --git a/examples/multimodal_pixtral.py b/examples/multimodal_pixtral.py
new file mode 100644
index 00000000..524b2904
--- /dev/null
+++ b/examples/multimodal_pixtral.py
@@ -0,0 +1,142 @@
+import sys, os
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from exllamav2 import (
+    ExLlamaV2,
+    ExLlamaV2Config,
+    ExLlamaV2Cache,
+    ExLlamaV2Tokenizer,
+    ExLlamaV2VisionTower,
+)
+
+from exllamav2.generator import (
+    ExLlamaV2DynamicGenerator,
+    ExLlamaV2DynamicJob,
+    ExLlamaV2Sampler,
+)
+
+from PIL import Image
+import requests
+
+# Model used:
+#
+# Quantized: https://huggingface.co/turboderp/pixtral-12b-exl2
+# Unquantized: https://huggingface.co/mistral-community/pixtral-12b/
+
+model_directory = "/mnt/str/models/pixtral-12b-exl2/6.0bpw"
+config = ExLlamaV2Config(model_directory)
+config.max_seq_len = 16384 # default is 1M
+
+# Load vision model and multimodal projector and initialize preprocessor
+
+vision_model = ExLlamaV2VisionTower(config)
+vision_model.load(progress = True)
+
+# Load EXL2 model
+
+model = ExLlamaV2(config)
+cache = ExLlamaV2Cache(model, lazy = True, max_seq_len = 16384)
+model.load_autosplit(cache, progress = True)
+tokenizer = ExLlamaV2Tokenizer(config)
+
+# Create generator
+
+generator = ExLlamaV2DynamicGenerator(
+    model = model,
+    cache = cache,
+    tokenizer = tokenizer
+)
+
+# Util function to get a PIL image from a URL or from a file in the script's directory
+
+def get_image(file = None, url = None):
+    assert (file or url) and not (file and url)
+    if file:
+        script_dir = os.path.dirname(os.path.abspath(__file__))
+        file_path = os.path.join(script_dir, file)
+        return Image.open(file_path)
+    elif url:
+        return Image.open(requests.get(url, stream = True).raw)
+
+# Convert image(s) to embeddings
+
+image_embeddings = [
+    vision_model.get_image_embeddings(
+        model = model,
+        tokenizer = tokenizer,
+        image = img,
+        text_alias = alias,
+    )
+    for (alias, img) in [
+        ("{{IMAGE_1}}", get_image(file = "test_image_1.jpg")),
+        ("{{IMAGE_2}}", get_image(file = "test_image_2.jpg")),
+        # ("{{IMAGE_3}}", get_image(url = "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRSERy82bn3jpYKr1cNxMLXTyEsVvSt2wZOIQ&s")),
+    ]
+]
+
+# Define a prompt using the aliases above as placeholders for image tokens. The tokenizer will replace each alias
+# with a range of temporary token IDs, and the model will embed those temporary IDs from their respective sources
+# rather than the model's text embedding table.
+#
+# The temporary IDs are unique for the lifetime of the process and persist as long as a reference is held to the
+# corresponding ExLlamaV2Embedding object. This way, images can be reused between generations, or used multiple
+# times for multiple jobs in a batch, and the generator will be able to apply prompt caching and deduplication to
+# image tokens as well as text tokens.
+#
+# Image token IDs are assigned sequentially, however, so two ExLlamaV2Embedding objects created from the same
+# source image will not be recognized as the same image for purposes of prompt caching etc.
+
+prompt = "[INST]{{IMAGE_1}}{{IMAGE_2}}\n" + \
+         "What are the similarities and differences between these two experiments?[/INST]"
+
+# Generate
+
+streaming = True
+greedy = True
+
+if streaming:
+
+    input_ids = tokenizer.encode(
+        prompt,
+        add_bos = True,
+        encode_special_tokens = True,
+        embeddings = image_embeddings,
+    )
+
+    job = ExLlamaV2DynamicJob(
+        input_ids = input_ids,
+        max_new_tokens = 500,
+        decode_special_tokens = True,
+        stop_conditions = [tokenizer.eos_token_id],
+        gen_settings = ExLlamaV2Sampler.Settings.greedy() if greedy else None,
+        embeddings = image_embeddings,
+    )
+
+    generator.enqueue(job)
+
+    print()
+    print(prompt, end = ""); sys.stdout.flush()
+
+    eos = False
+    while generator.num_remaining_jobs():
+        results = generator.iterate()
+        for result in results:
+            text = result.get("text", "")
+            print(text, end = ""); sys.stdout.flush()
+
+    print()
+
+else:
+
+    output = generator.generate(
+        prompt = prompt,
+        max_new_tokens = 500,
+        add_bos = True,
+        encode_special_tokens = True,
+        decode_special_tokens = True,
+        stop_conditions = [tokenizer.eos_token_id],
+        gen_settings = ExLlamaV2Sampler.Settings.greedy() if greedy else None,
+        embeddings = image_embeddings,
+    )
+
+    print(output)
\ No newline at end of file
diff --git a/experimental/test_image_1.jpg b/examples/test_image_1.jpg
similarity index 100%
rename from experimental/test_image_1.jpg
rename to examples/test_image_1.jpg
diff --git a/experimental/test_image_2.jpg b/examples/test_image_2.jpg
similarity index 100%
rename from experimental/test_image_2.jpg
rename to examples/test_image_2.jpg
diff --git a/experimental/multimodal_pixtral_hf.py b/experimental/multimodal_pixtral_hf.py
deleted file mode 100644
index a9e293c4..00000000
--- a/experimental/multimodal_pixtral_hf.py
+++ /dev/null
@@ -1,82 +0,0 @@
-import sys, os
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
-import torch
-
-from exllamav2 import (
-    ExLlamaV2,
-    ExLlamaV2Config,
-    ExLlamaV2Cache,
-    ExLlamaV2Tokenizer,
-    ExLlamaV2VisionTower,
-)
-
-from exllamav2.generator import (
-    ExLlamaV2DynamicGenerator,
-    ExLlamaV2Sampler,
-)
-
-from PIL import Image
-import requests
-
-# Unquantized model used for experiment:
-#
-# https://huggingface.co/mistral-community/pixtral-12b/
-
-model_directory = "/mnt/str/models/pixtral-12b-exl2/5.0bpw"
-config = ExLlamaV2Config(model_directory)
-config.max_seq_len = 16384 # default is 1M
-
-# Load vision model and multimodal projector and initialize preprocessor
-
-vision_model = ExLlamaV2VisionTower(config)
-vision_model.load(progress = True)
-
-# Load EXL2 model
-
-model = ExLlamaV2(config)
-cache = ExLlamaV2Cache(model, lazy = True, max_seq_len = 16384)
-model.load_autosplit(cache, progress = True)
-tokenizer = ExLlamaV2Tokenizer(config)
-
-# Create generator
-
-generator = ExLlamaV2DynamicGenerator(
-    model = model,
-    cache = cache,
-    tokenizer = tokenizer
-)
-
-# Create an MMEmbedding for the image features and a prompt containing the placeholder string
-
-image_embeddings = [
-    vision_model.get_image_embeddings(
-        model = model,
-        tokenizer = tokenizer,
-        image = img,
-        text_alias = alias,
-    )
-    for (alias, img) in [
-        ("{{IMAGE_1}}", Image.open("test_image_1.jpg")),
-        ("{{IMAGE_2}}", Image.open("test_image_2.jpg")),
-    ]
-]
-
-prompt = "[INST]{{IMAGE_1}}{{IMAGE_2}}\n" + \
-         "What are the similarities and differences between these two experiments?[/INST]"
-
-# Run prompt through generator, with embeddings. The tokenizer will insert preepared image tokens in place
-# of the aliases
-
-output = generator.generate(
-    prompt = prompt,
-    max_new_tokens = 500,
-    add_bos = True,
-    encode_special_tokens = True,
-    decode_special_tokens = True,
-    stop_conditions = [tokenizer.eos_token_id],
-    gen_settings = ExLlamaV2Sampler.Settings.greedy(),
-    embeddings = image_embeddings,
-)
-
-print(output)
\ No newline at end of file
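
A footnote on the reuse behavior described in the comment block of examples/multimodal_pixtral.py: the temporary image token IDs persist as long as a reference to the ExLlamaV2Embedding object is held, so the same embedding can feed multiple generations. Below is a minimal sketch of that pattern, assuming the same model, tokenizer, generator, vision_model and get_image() setup as in the example above; the two question strings are hypothetical placeholders.

# Sketch: hold one ExLlamaV2Embedding and reuse it for two generations. Passing
# the same object both times means the alias expands to the same temporary
# token IDs, so the generator can apply prompt caching and deduplication to the
# image tokens on the second call. (A second embedding created from the same
# JPEG would get new sequentially assigned IDs and would not share the cache.)

shared_embeddings = [
    vision_model.get_image_embeddings(
        model = model,
        tokenizer = tokenizer,
        image = get_image(file = "test_image_1.jpg"),
        text_alias = "{{IMAGE_1}}",
    )
]

for question in [
    "Describe the apparatus shown in this image.",   # hypothetical prompt
    "What measurements could this setup produce?",   # hypothetical prompt
]:
    prompt = "[INST]{{IMAGE_1}}\n" + question + "[/INST]"
    output = generator.generate(
        prompt = prompt,
        max_new_tokens = 300,
        add_bos = True,
        encode_special_tokens = True,
        decode_special_tokens = True,
        stop_conditions = [tokenizer.eos_token_id],
        gen_settings = ExLlamaV2Sampler.Settings.greedy(),
        embeddings = shared_embeddings,  # same list, and same objects, each time
    )
    print(output)

The same shared_embeddings list could equally be passed to several ExLlamaV2DynamicJob instances enqueued as a batch; per the comments in the patch, deduplication applies only when jobs share the embedding objects themselves, not merely the same source image.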