Fix unsupported scheduler in llm_bench for Text2ImagePipeline (#1207)
Task: [CVS-156973](https://jira.devtools.intel.com/browse/CVS-156973)

+ disable genai for text2image
ilya-lavrenov authored Nov 13, 2024
2 parents bfab4bf + ff1a8b1 commit 158f662
Showing 3 changed files with 87 additions and 19 deletions.
74 changes: 72 additions & 2 deletions tools/llm_bench/llm_bench_utils/ov_utils.py
@@ -8,6 +8,7 @@
 import logging as log
 import torch
 import time
+import json
 import types
 from llm_bench_utils.hook_common import get_bench_hook
 from llm_bench_utils.config_class import OV_MODEL_CLASSES_MAPPING, TOKENIZE_CLASSES_MAPPING, DEFAULT_MODEL_CLASSES
@@ -286,7 +287,8 @@ def create_image_gen_model(model_path, device, **kwargs):
         raise RuntimeError(f'==Failure ==: model path:{model_path} does not exist')
     else:
         if kwargs.get("genai", False) and is_genai_available(log_msg=True):
-            return create_genai_image_gen_model(model_path, device, ov_config, **kwargs)
+            log.warning("GenAI pipeline is not supported for this task. Switching to default benchmarking")
+            # return create_genai_image_gen_model(model_path, device, ov_config, **kwargs)
 
         start = time.perf_counter()
         ov_model = model_class.from_pretrained(model_path, device=device, ov_config=ov_config)
@@ -296,15 +298,83 @@ def create_image_gen_model(model_path, device, **kwargs):
     return ov_model, from_pretrained_time, False
 
 
+def get_genai_clip_text_encoder(model_index_data, model_path, device, ov_config):
+    import openvino_genai
+    text_encoder_type = model_index_data.get("text_encoder", [])
+    if ("CLIPTextModel" in text_encoder_type):
+        text_encoder = openvino_genai.CLIPTextModel(model_path / "text_encoder", device.upper(), **ov_config)
+    else:
+        raise RuntimeError(f'==Failure ==: model by path:{model_path} has unsupported text encoder type {text_encoder_type}')
+
+    return text_encoder
+
+
+def get_genai_clip_text_encoder_with_projection(model_index_data, model_path, text_encoder_path, device, ov_config):
+    import openvino_genai
+    text_encoder_type = model_index_data.get(text_encoder_path, [])
+    if ("CLIPTextModelWithProjection" in text_encoder_type):
+        text_encoder = openvino_genai.CLIPTextModelWithProjection(model_path / text_encoder_path, device.upper(), **ov_config)
+    else:
+        raise RuntimeError(f'==Failure ==: model by path:{model_path} has unsupported {text_encoder_path} type {text_encoder_type}')
+
+    return text_encoder
+
+
+def get_genai_unet_model(model_index_data, model_path, device, ov_config):
+    import openvino_genai
+    unet_type = model_index_data.get("unet", [])
+    if ("UNet2DConditionModel" in unet_type):
+        unet = openvino_genai.UNet2DConditionModel(model_path / "unet", device.upper(), **ov_config)
+    else:
+        raise RuntimeError(f'==Failure ==: model by path:{model_path} has unsupported UNet type {unet_type}')
+
+    return unet
+
+
 def create_genai_image_gen_model(model_path, device, ov_config, **kwargs):
     import openvino_genai
 
     adapter_config = get_lora_config(kwargs.get("lora", None), kwargs.get("lora_alphas", []))
     if adapter_config:
         ov_config['adapters'] = adapter_config
 
+    data = {}
+    with open(str(model_path / "model_index.json"), 'r') as f:
+        data = json.load(f)
+
+    model_class_name = data.get("_class_name", "")
+
     start = time.perf_counter()
-    t2i_pipe = openvino_genai.Text2ImagePipeline(model_path, device.upper(), **ov_config)
+
+    scheduler_type = data.get("scheduler", ["", ""])[1]
+    if (scheduler_type not in ["LCMScheduler", "DDIMScheduler", "LMSDiscreteScheduler", "EulerDiscreteScheduler", "FlowMatchEulerDiscreteScheduler"]):
+        scheduler = openvino_genai.Scheduler.from_config(model_path / "scheduler/scheduler_config.json", openvino_genai.Scheduler.Type.DDIM)
+        log.warning(f'Type of scheduler {scheduler_type} is unsupported. Please be aware that it will be replaced with DDIMScheduler')
+
+        vae_type = data.get("vae", [])
+        if ("AutoencoderKL" in vae_type):
+            vae = openvino_genai.AutoencoderKL(model_path / "vae_decoder", device.upper(), **ov_config)
+        else:
+            raise RuntimeError(f'==Failure ==: model by path:{model_path} has unsupported vae decoder type {vae_type}')
+
+        if model_class_name == "StableDiffusionPipeline":
+            text_encoder = get_genai_clip_text_encoder(data, model_path, device, ov_config)
+            unet = get_genai_unet_model(data, model_path, device, ov_config)
+            t2i_pipe = openvino_genai.Text2ImagePipeline.stable_diffusion(scheduler, text_encoder, unet, vae)
+        elif model_class_name == "LatentConsistencyModelPipeline":
+            text_encoder = get_genai_clip_text_encoder(data, model_path, device, ov_config)
+            unet = get_genai_unet_model(data, model_path, device, ov_config)
+            t2i_pipe = openvino_genai.Text2ImagePipeline.latent_consistency_model(scheduler, text_encoder, unet, vae)
+        elif model_class_name == "StableDiffusionXLPipeline":
+            clip_text_encoder = get_genai_clip_text_encoder(data, model_path, device, ov_config)
+            clip_text_encoder_2 = get_genai_clip_text_encoder_with_projection(data, model_path, "text_encoder_2", device, ov_config)
+            unet = get_genai_unet_model(data, model_path, device, ov_config)
+            t2i_pipe = openvino_genai.Text2ImagePipeline.stable_diffusion_xl(scheduler, clip_text_encoder, clip_text_encoder_2, unet, vae)
+        else:
+            raise RuntimeError(f'==Failure ==: model by path:{model_path} has unsupported _class_name {model_class_name}')
+    else:
+        t2i_pipe = openvino_genai.Text2ImagePipeline(model_path, device.upper(), **ov_config)
+
     end = time.perf_counter()
     log.info(f'Pipeline initialization time: {end - start:.2f}s')
     return t2i_pipe, end - start, True
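
For context, a minimal sketch of the model_index.json lookup the new fallback relies on. In a Diffusers-style export, each component maps to a ["library", "ClassName"] pair, so index [1] yields the class name. The directory name and the PNDMScheduler example are illustrative assumptions, not part of this commit:

import json
from pathlib import Path

SUPPORTED_SCHEDULERS = ["LCMScheduler", "DDIMScheduler", "LMSDiscreteScheduler",
                        "EulerDiscreteScheduler", "FlowMatchEulerDiscreteScheduler"]

model_path = Path("models/sd15-ov")  # hypothetical exported model directory

with open(model_path / "model_index.json") as f:
    # e.g. {"_class_name": "StableDiffusionPipeline", "scheduler": ["diffusers", "PNDMScheduler"], ...}
    data = json.load(f)

scheduler_type = data.get("scheduler", ["", ""])[1]
if scheduler_type not in SUPPORTED_SCHEDULERS:
    # The unsupported case (e.g. PNDMScheduler) is what previously made
    # Text2ImagePipeline construction fail; the commit instead swaps in a DDIM
    # scheduler built from scheduler/scheduler_config.json.
    print(f"{scheduler_type} is unsupported, falling back to DDIMScheduler")

When the scheduler is supported, the pipeline is still constructed directly from the model directory; the component-wise path only runs when a substitution is needed.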
9 changes: 0 additions & 9 deletions tools/who_what_benchmark/whowhatbench/text2image_evaluator.py
@@ -120,15 +120,6 @@ def worst_examples(self, top_k: int = 5, metric="similarity"):
         return res
 
     def _generate_data(self, model, gen_image_fn=None, image_dir="reference"):
-        if hasattr(model, "reshape") and self.resolution is not None:
-            if gen_image_fn is None:
-                model.reshape(
-                    batch_size=1,
-                    height=self.resolution[0],
-                    width=self.resolution[1],
-                    num_images_per_prompt=1,
-                )
-
         def default_gen_image_fn(model, prompt, num_inference_steps, generator=None):
             output = model(
                 prompt,
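
The deleted block statically reshaped Optimum-Intel pipelines to the requested resolution before generating reference images. With --image-size now defaulting to None (see wwb.py below), there is no fixed size to reshape to, so the evaluator keeps the model dynamic. A hedged sketch of the equivalent opt-in reshape a caller could still apply, reusing the reshape signature from the removed code with a hypothetical helper name:

def maybe_reshape(model, resolution):
    # resolution is a (height, width) tuple; None or (None, None) keeps dynamic shapes
    if hasattr(model, "reshape") and resolution is not None and resolution[0] is not None:
        model.reshape(
            batch_size=1,
            height=resolution[0],
            width=resolution[1],
            num_images_per_prompt=1,
        )
    return model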
23 changes: 15 additions & 8 deletions tools/who_what_benchmark/whowhatbench/wwb.py
@@ -308,7 +308,7 @@ def parse_args():
     parser.add_argument(
         "--image-size",
         type=int,
-        default=512,
+        default=None,
         help="Text-to-image specific parameter that defines the image resolution.",
     )
     parser.add_argument(
@@ -388,13 +388,20 @@ def genai_gen_answer(model, tokenizer, question, max_new_tokens, skip_question):
 
 
 def genai_gen_image(model, prompt, num_inference_steps, generator=None):
-    image_tensor = model.generate(
-        prompt,
-        width=model.resolution[0],
-        height=model.resolution[1],
-        num_inference_steps=num_inference_steps,
-        generator=generator,
-    )
+    if model.resolution[0] is not None:
+        image_tensor = model.generate(
+            prompt,
+            width=model.resolution[0],
+            height=model.resolution[1],
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+        )
+    else:
+        image_tensor = model.generate(
+            prompt,
+            num_inference_steps=num_inference_steps,
+            generator=generator,
+        )
     image = Image.fromarray(image_tensor.data[0])
     return image
