enable ipex/ov example
IlyasMoutawwakil committed Dec 10, 2024
Commit ef84a5a · 1 parent bb2a05a
Showing 13 changed files with 52 additions and 55 deletions.
15 changes: 7 additions & 8 deletions .github/workflows/test_cli_cpu_ipex.yaml
@@ -43,11 +43,10 @@ jobs:
       - name: Run tests
         run: pytest tests/test_cli.py -s -k "cli and cpu and ipex"
 
-      # examples require specific machines
-      # - if: ${{
-      #     (github.event_name == 'push') ||
-      #     (github.event_name == 'workflow_dispatch') ||
-      #     contains( github.event.pull_request.labels.*.name, 'examples')
-      #   }}
-      #   name: Run examples
-      #   run: pytest tests/test_examples.py -s -k "cli and cpu and ipex"
+      - if: ${{
+          (github.event_name == 'push') ||
+          (github.event_name == 'workflow_dispatch') ||
+          contains( github.event.pull_request.labels.*.name, 'examples')
+        }}
+        name: Run examples
+        run: pytest tests/test_examples.py -s -k "cli and cpu and ipex"
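
With the step uncommented, the example suite now runs on every push, on manual dispatch, or when a pull request carries the `examples` label (the openvino workflow below gets the identical change). To reproduce the CI selection locally, pytest can be driven from Python; a minimal sketch, assuming a checkout of this repository and its test naming:

    # Run the same example tests the CI step runs (sketch; repo checkout assumed).
    import sys
    import pytest

    if __name__ == "__main__":
        # Identical keyword filter to the workflow's `run:` line.
        sys.exit(pytest.main(["tests/test_examples.py", "-s", "-k", "cli and cpu and ipex"]))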
15 changes: 7 additions & 8 deletions .github/workflows/test_cli_cpu_openvino.yaml
@@ -44,11 +44,10 @@ jobs:
       - name: Run tests
         run: pytest tests/test_cli.py -s -k "cli and cpu and openvino"
 
-      # examples require specific machines
-      # - if: ${{
-      #     (github.event_name == 'push') ||
-      #     (github.event_name == 'workflow_dispatch') ||
-      #     contains( github.event.pull_request.labels.*.name, 'examples')
-      #   }}
-      #   name: Run examples
-      #   run: pytest tests/test_examples.py -s -k "cli and cpu and openvino"
+      - if: ${{
+          (github.event_name == 'push') ||
+          (github.event_name == 'workflow_dispatch') ||
+          contains( github.event.pull_request.labels.*.name, 'examples')
+        }}
+        name: Run examples
+        run: pytest tests/test_examples.py -s -k "cli and cpu and openvino"
4 changes: 2 additions & 2 deletions examples/cpu_ipex_bert.yaml
@@ -17,8 +17,8 @@ launcher:
 backend:
   device: cpu
   export: true
-  no_weights: false
-  torch_dtype: bfloat16
+  no_weights: false # because on multi-node machines, initializing weights could harm performance
+  torch_dtype: float32 # but use bfloat16 on compatible Intel CPUs
   model: google-bert/bert-base-uncased
 
 scenario:
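
The dtype switch above trades speed for portability: float32 runs everywhere, while bfloat16 pays off only where the hardware has fast native support. To pick `torch_dtype` programmatically instead of editing the YAML, one heuristic keys off the CPU's instruction-set capability; a sketch, assuming torch.backends.cpu.get_cpu_capability() (available in recent PyTorch) and treating AVX512 as a proxy for fast bfloat16:

    # Heuristic dtype pick for backend.torch_dtype (sketch, not part of the commit):
    # AVX512-class Intel CPUs generally have fast bfloat16 paths; older CPUs
    # emulate bfloat16, so float32 stays the safer default there.
    import torch

    def pick_cpu_dtype() -> str:
        capability = torch.backends.cpu.get_cpu_capability()  # e.g. "AVX2", "AVX512"
        return "bfloat16" if "AVX512" in capability else "float32"

    print(pick_cpu_dtype())  # paste the result into backend.torch_dtype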
10 changes: 3 additions & 7 deletions examples/cpu_ipex_llama.yaml
@@ -17,21 +17,17 @@ launcher:
 backend:
   device: cpu
   export: true
-  no_weights: false
-  torch_dtype: bfloat16
+  no_weights: false # because on multi-node machines, initializing weights could harm performance
+  torch_dtype: float32 # but use bfloat16 on compatible Intel CPUs
   model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
 
 scenario:
   memory: true
   latency: true
 
-  warmup_runs: 10
-  iterations: 10
-  duration: 10
-
   input_shapes:
     batch_size: 1
-    sequence_length: 256
+    sequence_length: 64
 
   generate_kwargs:
     max_new_tokens: 32
6 changes: 3 additions & 3 deletions examples/cpu_llama_cpp_embedding.yaml
@@ -17,8 +17,8 @@ backend:
 scenario:
   input_shapes:
     batch_size: 1
-    sequence_length: 256
+    sequence_length: 64
 
   generate_kwargs:
-    max_new_tokens: 100
-    min_new_tokens: 100
+    max_new_tokens: 32
+    min_new_tokens: 32
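
Pinning min_new_tokens equal to max_new_tokens forces every run to decode exactly the same number of tokens, so latency and throughput figures are comparable across runs; dropping 100 to 32 simply makes the CPU job cheaper. The idea in isolation, using the transformers generate API (illustrative only: the TinyLlama checkpoint is borrowed from the ipex example above, and the benchmark forwards these kwargs to generation internally):

    # min_new_tokens == max_new_tokens => generate() emits exactly 32 new tokens.
    from transformers import AutoModelForCausalLM, AutoTokenizer

    name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # illustrative checkpoint
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModelForCausalLM.from_pretrained(name)

    inputs = tokenizer("a benchmarking prompt", return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=32, min_new_tokens=32)
    print(outputs.shape[-1] - inputs["input_ids"].shape[-1])  # always 32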
9 changes: 4 additions & 5 deletions examples/cpu_llama_cpp_text_generation.yaml
@@ -15,10 +15,9 @@ backend:
   filename: tinyllama-1.1b-chat-v1.0.Q4_0.gguf
 
 scenario:
+  memory: true
+  latency: true
+
   input_shapes:
     batch_size: 1
-    sequence_length: 256
-
-  generate_kwargs:
-    max_new_tokens: 100
-    min_new_tokens: 100
+    sequence_length: 128
6 changes: 6 additions & 0 deletions examples/cpu_onnxruntime_static_quant_vit.yaml
@@ -18,3 +18,9 @@ backend:
   is_static: true
   per_channel: false
   calibration: true
+
+scenario:
+  memory: true
+  latency: true
+  input_shapes:
+    batch_size: 2
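
Previously this example only exercised static quantization; the new scenario block makes it measure something. In the Python API used later in this commit (examples/cuda_pytorch_bert.py), the equivalent object is an InferenceConfig; a one-line sketch, with the top-level import path assumed:

    from optimum_benchmark import InferenceConfig  # import path assumed

    # Same meaning as the YAML block above: track memory and latency, batch of 2.
    scenario_config = InferenceConfig(memory=True, latency=True, input_shapes={"batch_size": 2})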
5 changes: 4 additions & 1 deletion (file path not captured)
@@ -16,6 +16,9 @@ backend:
   model: google-bert/bert-base-uncased
 
 scenario:
+  memory: true
+  latency: true
+
   input_shapes:
     batch_size: 1
-    sequence_length: 16
+    sequence_length: 128
2 changes: 1 addition & 1 deletion examples/cpu_openvino_diffusion.yaml
@@ -9,10 +9,10 @@ defaults:
 name: openvino_diffusion
 
 backend:
-  half: true
   device: cpu
   export: true
   model: stabilityai/stable-diffusion-2-1
+  half: false # enable half-precision on compatible Intel CPU machines
 
 scenario:
   input_shapes:
16 changes: 5 additions & 11 deletions examples/cuda_pytorch_bert.py
@@ -8,7 +8,11 @@
 PUSH_REPO_ID = os.environ.get("PUSH_REPO_ID", None)
 
 
-def run_benchmark():
+if __name__ == "__main__":
+    level = os.environ.get("LOG_LEVEL", "INFO")
+    to_file = os.environ.get("LOG_TO_FILE", "0") == "1"
+    setup_logging(level=level, to_file=to_file, prefix="MAIN-PROCESS")
+
     launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="warn")
     backend_config = PyTorchConfig(device="cuda", device_ids="0", no_weights=True, model=MODEL)
     scenario_config = InferenceConfig(memory=True, latency=True, input_shapes={"batch_size": 1, "sequence_length": 128})
@@ -21,16 +25,6 @@ def run_benchmark():
         log_report=True,
     )
     benchmark_report = Benchmark.launch(benchmark_config)
-
-    return benchmark_config, benchmark_report
-
-
-if __name__ == "__main__":
-    level = os.environ.get("LOG_LEVEL", "INFO")
-    to_file = os.environ.get("LOG_TO_FILE", "0") == "1"
-    setup_logging(level=level, to_file=to_file, prefix="MAIN-PROCESS")
-
-    benchmark_config, benchmark_report = run_benchmark()
     benchmark = Benchmark(config=benchmark_config, report=benchmark_report)
 
     if PUSH_REPO_ID is not None:
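
The refactor collapses the old run_benchmark() indirection into one straight-line __main__ block: configure logging first, then build the configs, launch, and wrap the report. The hunk is cut off before the file's final lines; presumably they publish to the Hub when PUSH_REPO_ID is set. A condensed sketch of the post-commit flow, with the reconstructed parts flagged:

    # Condensed flow of examples/cuda_pytorch_bert.py after this commit
    # (imports and the truncated tail are reconstructed; hedged where noted).
    import os

    from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, ProcessConfig, PyTorchConfig
    from optimum_benchmark.logging_utils import setup_logging  # import path assumed

    MODEL = "google-bert/bert-base-uncased"  # value assumed; defined in the unchanged header
    PUSH_REPO_ID = os.environ.get("PUSH_REPO_ID", None)

    if __name__ == "__main__":
        setup_logging(level=os.environ.get("LOG_LEVEL", "INFO"),
                      to_file=os.environ.get("LOG_TO_FILE", "0") == "1",
                      prefix="MAIN-PROCESS")

        benchmark_config = BenchmarkConfig(
            name="cuda_pytorch_bert",  # name assumed; the real value sits in cut-off context
            launcher=ProcessConfig(device_isolation=True, device_isolation_action="warn"),
            backend=PyTorchConfig(device="cuda", device_ids="0", no_weights=True, model=MODEL),
            scenario=InferenceConfig(memory=True, latency=True,
                                     input_shapes={"batch_size": 1, "sequence_length": 128}),
            log_report=True,
        )
        benchmark_report = Benchmark.launch(benchmark_config)
        benchmark = Benchmark(config=benchmark_config, report=benchmark_report)

        if PUSH_REPO_ID is not None:
            benchmark.push_to_hub(repo_id=PUSH_REPO_ID)  # assumed: the hunk ends before this line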
15 changes: 8 additions & 7 deletions examples/cuda_pytorch_bert.yaml
@@ -12,15 +12,16 @@ launcher:
   device_isolation: true
   device_isolation_action: warn
 
-backend:
-  device: cuda
-  device_ids: 0
-  no_weights: true
-  model: google-bert/bert-base-uncased
-
 scenario:
-  latency: true
   memory: true
+  latency: true
 
   input_shapes:
     batch_size: 1
     sequence_length: 128
+
+backend:
+  device: cuda
+  device_ids: 0
+  no_weights: true
+  model: bert-base-uncased
2 changes: 1 addition & 1 deletion examples/cuda_pytorch_llama_quants.py
@@ -46,7 +46,7 @@ def run_benchmark(weight_config: str):
         duration=10,
         iterations=10,
         warmup_runs=10,
-        input_shapes={"batch_size": 1, "sequence_length": 128},
+        input_shapes={"batch_size": 1, "sequence_length": 64},
         generate_kwargs={"max_new_tokens": 32, "min_new_tokens": 32},
     )
     benchmark_config = BenchmarkConfig(
2 changes: 1 addition & 1 deletion examples/cuda_pytorch_vlm.yaml
@@ -30,7 +30,7 @@ scenario:
   input_shapes:
     # text
     batch_size: 1
-    sequence_length: 256
+    sequence_length: 64
     # image
     num_images: 2
     num_channels: 3
