diff --git a/docs/contents/ai/socratiq.html b/docs/contents/ai/socratiq.html index 73b009a..7820601 100644 --- a/docs/contents/ai/socratiq.html +++ b/docs/contents/ai/socratiq.html @@ -438,6 +438,12 @@ Small Language Models (SLM) + + diff --git a/docs/contents/core/about/about.html b/docs/contents/core/about/about.html index aaa02ae..f7acf81 100644 --- a/docs/contents/core/about/about.html +++ b/docs/contents/core/about/about.html @@ -436,6 +436,12 @@ Small Language Models (SLM) + + diff --git a/docs/contents/core/acknowledgements/acknowledgements.html b/docs/contents/core/acknowledgements/acknowledgements.html index 450e5e5..29d86d3 100644 --- a/docs/contents/core/acknowledgements/acknowledgements.html +++ b/docs/contents/core/acknowledgements/acknowledgements.html @@ -439,6 +439,12 @@ Small Language Models (SLM) + + diff --git a/docs/contents/core/ai_for_good/ai_for_good.html b/docs/contents/core/ai_for_good/ai_for_good.html index e95b9ef..b7e8887 100644 --- a/docs/contents/core/ai_for_good/ai_for_good.html +++ b/docs/contents/core/ai_for_good/ai_for_good.html @@ -488,6 +488,12 @@ Small Language Models (SLM) + + diff --git a/docs/contents/core/benchmarking/benchmarking.html b/docs/contents/core/benchmarking/benchmarking.html index d242bba..a0b9482 100644 --- a/docs/contents/core/benchmarking/benchmarking.html +++ b/docs/contents/core/benchmarking/benchmarking.html @@ -459,6 +459,12 @@ Small Language Models (SLM) + + diff --git a/docs/contents/core/conclusion/conclusion.html b/docs/contents/core/conclusion/conclusion.html index fad284f..d1948d7 100644 --- a/docs/contents/core/conclusion/conclusion.html +++ b/docs/contents/core/conclusion/conclusion.html @@ -438,6 +438,12 @@ Small Language Models (SLM) + + diff --git a/docs/contents/core/data_engineering/data_engineering.html b/docs/contents/core/data_engineering/data_engineering.html index 0f98236..611ad14 100644 --- a/docs/contents/core/data_engineering/data_engineering.html +++ b/docs/contents/core/data_engineering/data_engineering.html @@ -488,6 +488,12 @@ Small Language Models (SLM) + + diff --git a/docs/contents/core/dl_primer/dl_primer.html b/docs/contents/core/dl_primer/dl_primer.html index cb3b25b..d3eb409 100644 --- a/docs/contents/core/dl_primer/dl_primer.html +++ b/docs/contents/core/dl_primer/dl_primer.html @@ -488,6 +488,12 @@ Small Language Models (SLM) + + diff --git a/docs/contents/core/efficient_ai/efficient_ai.html b/docs/contents/core/efficient_ai/efficient_ai.html index 67f8f67..d27e300 100644 --- a/docs/contents/core/efficient_ai/efficient_ai.html +++ b/docs/contents/core/efficient_ai/efficient_ai.html @@ -459,6 +459,12 @@ Small Language Models (SLM) + + diff --git a/docs/contents/core/frameworks/frameworks.html b/docs/contents/core/frameworks/frameworks.html index 3bfef71..98879b0 100644 --- a/docs/contents/core/frameworks/frameworks.html +++ b/docs/contents/core/frameworks/frameworks.html @@ -522,6 +522,12 @@ Small Language Models (SLM) + + @@ -1610,7 +1616,7 @@

6.9 Choosing the Right Framework

Choosing the right machine learning framework for a given application requires carefully evaluating models, hardware, and software considerations. Figure 6.13 provides a comparison of different TensorFlow frameworks, which we’ll discuss in more detail:

-
+
@@ -1630,7 +1636,7 @@

6.9.2 Software

As shown in Figure 6.14, TensorFlow Lite Micro does not have OS support, while TensorFlow and TensorFlow Lite do. This design choice for TensorFlow Lite Micro helps reduce memory overhead, make startup times faster, and consume less energy. Instead, TensorFlow Lite Micro can be used in conjunction with real-time operating systems (RTOS) like FreeRTOS, Zephyr, and Mbed OS.

The figure also highlights an important memory management feature: TensorFlow Lite and TensorFlow Lite Micro support model memory mapping, allowing models to be directly accessed from flash storage rather than loaded into RAM. In contrast, TensorFlow does not offer this capability.
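As a concrete illustration (a minimal sketch, not taken from the chapter; the model filename is hypothetical), the TensorFlow Lite Python interpreter can open a converted flatbuffer directly from a file path, so the model data is accessed in place rather than being rebuilt as a full TensorFlow graph in memory:

import tensorflow as tf

# Open a converted .tflite flatbuffer; the file is accessed in place
# rather than reconstructed as a full TensorFlow graph in RAM.
interpreter = tf.lite.Interpreter(model_path="model.tflite")  # hypothetical file
interpreter.allocate_tensors()
print(interpreter.get_input_details())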

-
+
@@ -1646,7 +1652,7 @@

6.9.3 Hardware

TensorFlow Lite and TensorFlow Lite Micro have significantly smaller base binary sizes and memory footprints than TensorFlow (see Figure 6.15). For example, a typical TensorFlow Lite Micro binary is less than 200KB, whereas TensorFlow is much larger. This smaller footprint reflects their design for the resource-constrained environments of embedded systems. TensorFlow supports x86 CPUs, TPUs, and GPUs from NVIDIA, AMD, and Intel.

-
+
@@ -1693,7 +1699,7 @@

6.10.1 Decomposition

Currently, the ML system stack consists of four abstractions as shown in Figure 6.16, namely (1) computational graphs, (2) tensor programs, (3) libraries and runtimes, and (4) hardware primitives.

-
+
diff --git a/docs/contents/core/generative_ai/generative_ai.html b/docs/contents/core/generative_ai/generative_ai.html index 55a6118..3a8be38 100644 --- a/docs/contents/core/generative_ai/generative_ai.html +++ b/docs/contents/core/generative_ai/generative_ai.html @@ -436,6 +436,12 @@ Small Language Models (SLM)
+ + diff --git a/docs/contents/core/hw_acceleration/hw_acceleration.html b/docs/contents/core/hw_acceleration/hw_acceleration.html index 0b7affb..f929e37 100644 --- a/docs/contents/core/hw_acceleration/hw_acceleration.html +++ b/docs/contents/core/hw_acceleration/hw_acceleration.html @@ -459,6 +459,12 @@ Small Language Models (SLM)
+ + diff --git a/docs/contents/core/introduction/introduction.html b/docs/contents/core/introduction/introduction.html index af45b52..835e6d7 100644 --- a/docs/contents/core/introduction/introduction.html +++ b/docs/contents/core/introduction/introduction.html @@ -458,6 +458,12 @@ Small Language Models (SLM)
+ + diff --git a/docs/contents/core/ml_systems/ml_systems.html b/docs/contents/core/ml_systems/ml_systems.html index fd9511d..cf1658d 100644 --- a/docs/contents/core/ml_systems/ml_systems.html +++ b/docs/contents/core/ml_systems/ml_systems.html @@ -459,6 +459,12 @@ Small Language Models (SLM)
+ + diff --git a/docs/contents/core/ondevice_learning/ondevice_learning.html b/docs/contents/core/ondevice_learning/ondevice_learning.html index 1d891e9..0de985d 100644 --- a/docs/contents/core/ondevice_learning/ondevice_learning.html +++ b/docs/contents/core/ondevice_learning/ondevice_learning.html @@ -488,6 +488,12 @@ Small Language Models (SLM)
+ + diff --git a/docs/contents/core/ops/ops.html b/docs/contents/core/ops/ops.html index e30a9fd..7ef7aa9 100644 --- a/docs/contents/core/ops/ops.html +++ b/docs/contents/core/ops/ops.html @@ -488,6 +488,12 @@ Small Language Models (SLM)
+ + diff --git a/docs/contents/core/optimizations/optimizations.html b/docs/contents/core/optimizations/optimizations.html index 6d8f7ba..a2eb571 100644 --- a/docs/contents/core/optimizations/optimizations.html +++ b/docs/contents/core/optimizations/optimizations.html @@ -488,6 +488,12 @@ Small Language Models (SLM)

+ + diff --git a/docs/contents/core/privacy_security/privacy_security.html b/docs/contents/core/privacy_security/privacy_security.html index dc5cead..b8e9422 100644 --- a/docs/contents/core/privacy_security/privacy_security.html +++ b/docs/contents/core/privacy_security/privacy_security.html @@ -488,6 +488,12 @@ Small Language Models (SLM)
+ + diff --git a/docs/contents/core/responsible_ai/responsible_ai.html b/docs/contents/core/responsible_ai/responsible_ai.html index 96e81cb..924fbd6 100644 --- a/docs/contents/core/responsible_ai/responsible_ai.html +++ b/docs/contents/core/responsible_ai/responsible_ai.html @@ -488,6 +488,12 @@ Small Language Models (SLM)
+ + diff --git a/docs/contents/core/robust_ai/robust_ai.html b/docs/contents/core/robust_ai/robust_ai.html index 829e7d5..d92cf07 100644 --- a/docs/contents/core/robust_ai/robust_ai.html +++ b/docs/contents/core/robust_ai/robust_ai.html @@ -459,6 +459,12 @@ Small Language Models (SLM)
+ + diff --git a/docs/contents/core/sustainable_ai/sustainable_ai.html b/docs/contents/core/sustainable_ai/sustainable_ai.html index 24bfb0d..2d60c78 100644 --- a/docs/contents/core/sustainable_ai/sustainable_ai.html +++ b/docs/contents/core/sustainable_ai/sustainable_ai.html @@ -488,6 +488,12 @@ Small Language Models (SLM)
+ + diff --git a/docs/contents/core/training/training.html b/docs/contents/core/training/training.html index 216eaf9..5185fd5 100644 --- a/docs/contents/core/training/training.html +++ b/docs/contents/core/training/training.html @@ -488,6 +488,12 @@ Small Language Models (SLM)
+ + diff --git a/docs/contents/core/workflow/workflow.html b/docs/contents/core/workflow/workflow.html index 759dbad..dadb31f 100644 --- a/docs/contents/core/workflow/workflow.html +++ b/docs/contents/core/workflow/workflow.html @@ -439,6 +439,12 @@ Small Language Models (SLM) + + diff --git a/docs/contents/labs/arduino/nicla_vision/image_classification/image_classification.html b/docs/contents/labs/arduino/nicla_vision/image_classification/image_classification.html index 8f12042..8c5b1d4 100644 --- a/docs/contents/labs/arduino/nicla_vision/image_classification/image_classification.html +++ b/docs/contents/labs/arduino/nicla_vision/image_classification/image_classification.html @@ -473,6 +473,12 @@ Small Language Models (SLM) + + diff --git a/docs/contents/labs/arduino/nicla_vision/kws/kws.html b/docs/contents/labs/arduino/nicla_vision/kws/kws.html index 0216902..2fa3f3e 100644 --- a/docs/contents/labs/arduino/nicla_vision/kws/kws.html +++ b/docs/contents/labs/arduino/nicla_vision/kws/kws.html @@ -473,6 +473,12 @@ Small Language Models (SLM) + + diff --git a/docs/contents/labs/arduino/nicla_vision/motion_classification/motion_classification.html b/docs/contents/labs/arduino/nicla_vision/motion_classification/motion_classification.html index 6062910..68db2f5 100644 --- a/docs/contents/labs/arduino/nicla_vision/motion_classification/motion_classification.html +++ b/docs/contents/labs/arduino/nicla_vision/motion_classification/motion_classification.html @@ -502,6 +502,12 @@ Small Language Models (SLM) + + diff --git a/docs/contents/labs/arduino/nicla_vision/nicla_vision.html b/docs/contents/labs/arduino/nicla_vision/nicla_vision.html index 524de37..e7d9409 100644 --- a/docs/contents/labs/arduino/nicla_vision/nicla_vision.html +++ b/docs/contents/labs/arduino/nicla_vision/nicla_vision.html @@ -439,6 +439,12 @@ Small Language Models (SLM) + + diff --git a/docs/contents/labs/arduino/nicla_vision/object_detection/object_detection.html b/docs/contents/labs/arduino/nicla_vision/object_detection/object_detection.html index 2d0b074..542d2b7 100644 --- a/docs/contents/labs/arduino/nicla_vision/object_detection/object_detection.html +++ b/docs/contents/labs/arduino/nicla_vision/object_detection/object_detection.html @@ -473,6 +473,12 @@ Small Language Models (SLM) + + diff --git a/docs/contents/labs/arduino/nicla_vision/setup/setup.html b/docs/contents/labs/arduino/nicla_vision/setup/setup.html index 12a0375..ed05ee5 100644 --- a/docs/contents/labs/arduino/nicla_vision/setup/setup.html +++ b/docs/contents/labs/arduino/nicla_vision/setup/setup.html @@ -473,6 +473,12 @@ Small Language Models (SLM) + + diff --git a/docs/contents/labs/getting_started.html b/docs/contents/labs/getting_started.html index 1561534..7b8e731 100644 --- a/docs/contents/labs/getting_started.html +++ b/docs/contents/labs/getting_started.html @@ -436,6 +436,12 @@ Small Language Models (SLM) + + diff --git a/docs/contents/labs/labs.html b/docs/contents/labs/labs.html index c6484c8..6b64387 100644 --- a/docs/contents/labs/labs.html +++ b/docs/contents/labs/labs.html @@ -435,6 +435,12 @@ Small Language Models (SLM) + + diff --git a/docs/contents/labs/overview.html b/docs/contents/labs/overview.html index f86846f..2a4d4c5 100644 --- a/docs/contents/labs/overview.html +++ b/docs/contents/labs/overview.html @@ -436,6 +436,12 @@ Small Language Models (SLM) + + @@ -568,47 +574,53 @@

Supported Devices

Exercise -Nicla Vision -XIAO ESP32S3 -Raspberry Pi +Nicla Vision +XIAO ESP32S3 +Raspberry Pi Installation & Setup -✓ -✓ -✓ +✓ +✓ +✓ Keyword Spotting (KWS) -✓ -✓ - +✓ +✓ + Image Classification -✓ -✓ -✓ +✓ +✓ +✓ Object Detection -✓ -✓ -✓ +✓ +✓ +✓ Motion Detection -✓ -✓ - +✓ +✓ + Small Language Models (SLM) - - -✓ + + +✓ + + +Vision Language Models (VLM) + + +✓ diff --git a/docs/contents/labs/raspi/image_classification/image_classification.html b/docs/contents/labs/raspi/image_classification/image_classification.html index 6ed4fd1..90be305 100644 --- a/docs/contents/labs/raspi/image_classification/image_classification.html +++ b/docs/contents/labs/raspi/image_classification/image_classification.html @@ -473,6 +473,12 @@ Small Language Models (SLM) + + diff --git a/docs/contents/labs/raspi/llm/llm.html b/docs/contents/labs/raspi/llm/llm.html index d695c73..06d3644 100644 --- a/docs/contents/labs/raspi/llm/llm.html +++ b/docs/contents/labs/raspi/llm/llm.html @@ -64,7 +64,7 @@ - + @@ -473,6 +473,12 @@ Small Language Models (SLM) + + @@ -2090,8 +2096,8 @@

Resources

diff --git a/docs/contents/labs/raspi/object_detection/object_detection.html b/docs/contents/labs/raspi/object_detection/object_detection.html index 2ad7221..119cf85 100644 --- a/docs/contents/labs/raspi/object_detection/object_detection.html +++ b/docs/contents/labs/raspi/object_detection/object_detection.html @@ -473,6 +473,12 @@ Small Language Models (SLM) + + diff --git a/docs/contents/labs/raspi/raspi.html b/docs/contents/labs/raspi/raspi.html index f43d11b..73a49d8 100644 --- a/docs/contents/labs/raspi/raspi.html +++ b/docs/contents/labs/raspi/raspi.html @@ -439,6 +439,12 @@ Small Language Models (SLM) + + @@ -531,7 +537,7 @@

Raspberry Pi

Pre-requisites

    -
  • Raspberry Pi: Ensure you have at least one of the boards: the Raspberry Pi Zero 2W, Raspberry Pi 4 or 5 for the Vision Labs, and the Raspberry 5 for the GenAi lab.
  • +
  • Raspberry Pi: Ensure you have at least one of the boards: the Raspberry Pi Zero 2W, Raspberry Pi 4 or 5 for the Vision Labs, and the Raspberry Pi 5 for the GenAI labs.
  • Power Adapter: To Power on the boards.
    • Raspberry Pi Zero 2-W: 2.5W with a Micro-USB adapter
    • @@ -552,36 +558,36 @@

      Exercises

      - - - - + + + + - - - - + + + + - - - - + + + + - - - - + + + + - - - - + + + +
      ModalityTaskDescriptionLinkModalityTaskDescriptionLink
      VisionImage ClassificationLearn to classify imagesLinkVisionImage ClassificationLearn to classify imagesLink
      VisionObject DetectionImplement object detectionLinkVisionObject DetectionImplement object detectionLink
      GenAISmall Language ModelsDeploy SLMs at the EdgeLinkGenAISmall Language ModelsDeploy SLMs at the EdgeLink
      GenAIVision-Language ModelsDeploy VLMs at the EdgeLink
      diff --git a/docs/contents/labs/raspi/setup/setup.html b/docs/contents/labs/raspi/setup/setup.html index 9d576f4..f4d53a7 100644 --- a/docs/contents/labs/raspi/setup/setup.html +++ b/docs/contents/labs/raspi/setup/setup.html @@ -473,6 +473,12 @@ Small Language Models (SLM) + +
  • diff --git a/docs/contents/labs/raspi/vlm/images/jpeg/cover.jpg b/docs/contents/labs/raspi/vlm/images/jpeg/cover.jpg new file mode 100644 index 0000000..5d64e4d Binary files /dev/null and b/docs/contents/labs/raspi/vlm/images/jpeg/cover.jpg differ diff --git a/docs/contents/labs/raspi/vlm/images/jpeg/florence-diagr.jpg b/docs/contents/labs/raspi/vlm/images/jpeg/florence-diagr.jpg new file mode 100644 index 0000000..fbcfd24 Binary files /dev/null and b/docs/contents/labs/raspi/vlm/images/jpeg/florence-diagr.jpg differ diff --git a/docs/contents/labs/raspi/vlm/images/jpeg/raspi5-active-cooler.jpg b/docs/contents/labs/raspi/vlm/images/jpeg/raspi5-active-cooler.jpg new file mode 100644 index 0000000..2c69812 Binary files /dev/null and b/docs/contents/labs/raspi/vlm/images/jpeg/raspi5-active-cooler.jpg differ diff --git a/docs/contents/labs/raspi/vlm/images/png/Dataset_FLD-5B.png b/docs/contents/labs/raspi/vlm/images/png/Dataset_FLD-5B.png new file mode 100644 index 0000000..2f6bf8d Binary files /dev/null and b/docs/contents/labs/raspi/vlm/images/png/Dataset_FLD-5B.png differ diff --git a/docs/contents/labs/raspi/vlm/images/png/arch.png b/docs/contents/labs/raspi/vlm/images/png/arch.png new file mode 100644 index 0000000..f9bb96a Binary files /dev/null and b/docs/contents/labs/raspi/vlm/images/png/arch.png differ diff --git a/docs/contents/labs/raspi/vlm/images/png/caption_ground.png b/docs/contents/labs/raspi/vlm/images/png/caption_ground.png new file mode 100644 index 0000000..42c1a60 Binary files /dev/null and b/docs/contents/labs/raspi/vlm/images/png/caption_ground.png differ diff --git a/docs/contents/labs/raspi/vlm/images/png/cascade.png b/docs/contents/labs/raspi/vlm/images/png/cascade.png new file mode 100644 index 0000000..6bc0a43 Binary files /dev/null and b/docs/contents/labs/raspi/vlm/images/png/cascade.png differ diff --git a/docs/contents/labs/raspi/vlm/images/png/dog_bottle_seg.png b/docs/contents/labs/raspi/vlm/images/png/dog_bottle_seg.png new file mode 100644 index 0000000..ae8e796 Binary files /dev/null and b/docs/contents/labs/raspi/vlm/images/png/dog_bottle_seg.png differ diff --git a/docs/contents/labs/raspi/vlm/images/png/fine-tuning.png b/docs/contents/labs/raspi/vlm/images/png/fine-tuning.png new file mode 100644 index 0000000..900afff Binary files /dev/null and b/docs/contents/labs/raspi/vlm/images/png/fine-tuning.png differ diff --git a/docs/contents/labs/raspi/vlm/images/png/flyer.png b/docs/contents/labs/raspi/vlm/images/png/flyer.png new file mode 100644 index 0000000..383afaf Binary files /dev/null and b/docs/contents/labs/raspi/vlm/images/png/flyer.png differ diff --git a/docs/contents/labs/raspi/vlm/images/png/fusca.png b/docs/contents/labs/raspi/vlm/images/png/fusca.png new file mode 100644 index 0000000..5587651 Binary files /dev/null and b/docs/contents/labs/raspi/vlm/images/png/fusca.png differ diff --git a/docs/contents/labs/raspi/vlm/images/png/htop.png b/docs/contents/labs/raspi/vlm/images/png/htop.png new file mode 100644 index 0000000..46f506d Binary files /dev/null and b/docs/contents/labs/raspi/vlm/images/png/htop.png differ diff --git a/docs/contents/labs/raspi/vlm/images/png/img_test.png b/docs/contents/labs/raspi/vlm/images/png/img_test.png new file mode 100644 index 0000000..2b39fbb Binary files /dev/null and b/docs/contents/labs/raspi/vlm/images/png/img_test.png differ diff --git a/docs/contents/labs/raspi/vlm/images/png/jupyter.png b/docs/contents/labs/raspi/vlm/images/png/jupyter.png new file mode 100644 index 0000000..0f3e08e Binary files 
/dev/null and b/docs/contents/labs/raspi/vlm/images/png/jupyter.png differ diff --git a/docs/contents/labs/raspi/vlm/images/png/od_caption_cat_dog_table.png b/docs/contents/labs/raspi/vlm/images/png/od_caption_cat_dog_table.png new file mode 100644 index 0000000..6f1c6e1 Binary files /dev/null and b/docs/contents/labs/raspi/vlm/images/png/od_caption_cat_dog_table.png differ diff --git a/docs/contents/labs/raspi/vlm/images/png/od_cat_dogs.png b/docs/contents/labs/raspi/vlm/images/png/od_cat_dogs.png new file mode 100644 index 0000000..ebeb9df Binary files /dev/null and b/docs/contents/labs/raspi/vlm/images/png/od_cat_dogs.png differ diff --git a/docs/contents/labs/raspi/vlm/images/png/od_table.png b/docs/contents/labs/raspi/vlm/images/png/od_table.png new file mode 100644 index 0000000..bf6f017 Binary files /dev/null and b/docs/contents/labs/raspi/vlm/images/png/od_table.png differ diff --git a/docs/contents/labs/raspi/vlm/images/png/open_vacab_exemples.png b/docs/contents/labs/raspi/vlm/images/png/open_vacab_exemples.png new file mode 100644 index 0000000..30e18f8 Binary files /dev/null and b/docs/contents/labs/raspi/vlm/images/png/open_vacab_exemples.png differ diff --git a/docs/contents/labs/raspi/vlm/images/png/orange_seg.png b/docs/contents/labs/raspi/vlm/images/png/orange_seg.png new file mode 100644 index 0000000..7e64c23 Binary files /dev/null and b/docs/contents/labs/raspi/vlm/images/png/orange_seg.png differ diff --git a/docs/contents/labs/raspi/vlm/images/png/output_ocr.png b/docs/contents/labs/raspi/vlm/images/png/output_ocr.png new file mode 100644 index 0000000..2e4b719 Binary files /dev/null and b/docs/contents/labs/raspi/vlm/images/png/output_ocr.png differ diff --git a/docs/contents/labs/raspi/vlm/images/png/result_test.png b/docs/contents/labs/raspi/vlm/images/png/result_test.png new file mode 100644 index 0000000..771f1f2 Binary files /dev/null and b/docs/contents/labs/raspi/vlm/images/png/result_test.png differ diff --git a/docs/contents/labs/raspi/vlm/images/png/ssh.png b/docs/contents/labs/raspi/vlm/images/png/ssh.png new file mode 100644 index 0000000..68362be Binary files /dev/null and b/docs/contents/labs/raspi/vlm/images/png/ssh.png differ diff --git a/docs/contents/labs/raspi/vlm/images/png/test-pytorch.png b/docs/contents/labs/raspi/vlm/images/png/test-pytorch.png new file mode 100644 index 0000000..4f1aaf6 Binary files /dev/null and b/docs/contents/labs/raspi/vlm/images/png/test-pytorch.png differ diff --git a/docs/contents/labs/raspi/vlm/vlm.html b/docs/contents/labs/raspi/vlm/vlm.html new file mode 100644 index 0000000..ef677e6 --- /dev/null +++ b/docs/contents/labs/raspi/vlm/vlm.html @@ -0,0 +1,2216 @@ + + + + + + + + + +Vision-Language Models (VLM) – Machine Learning Systems + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + +
    + +
    + + +
    + + + +
    + +
    +
    +

    Vision-Language Models (VLM)

    +
    + + + +
    + + + + +
    + + + +
    + + +
    +
    +


    +
    DALL·E prompt - A Raspberry Pi setup featuring vision tasks. The image shows a Raspberry Pi connected to a camera, with various computer vision tasks displayed visually around it, including object detection, image captioning, segmentation, and visual grounding. The Raspberry Pi is placed on a desk, with a display showing bounding boxes and annotations related to these tasks. The background should be a home workspace, with tools and devices typically used by developers and hobbyists.
    +
    +
    +
    +

    Introduction

    +

    In this hands-on lab, we will continue to explore AI applications at the Edge, from the basic setup of Florence-2, Microsoft’s state-of-the-art vision foundation model, to advanced implementations on devices like the Raspberry Pi. We will learn to use Vision-Language Models (VLMs) for tasks such as captioning, object detection, grounding, segmentation, and OCR on a Raspberry Pi.

    +
    +

    Why Florence-2 at the Edge?

    +

    Florence-2 is a vision-language model open-sourced by Microsoft under the MIT license, which significantly advances vision-language models by combining a lightweight architecture with robust capabilities. Thanks to its training on the massive FLD-5B dataset, which contains 126 million images and 5.4 billion visual annotations, it achieves performance comparable to larger models. This makes Florence-2 ideal for deployment at the edge, where power and computational resources are limited.

    +

    In this tutorial, we will explore how to use Florence-2 for real-time computer vision applications, such as:

    +
      +
    • Image captioning
    • +
    • Object detection
    • +
    • Segmentation
    • +
    • Visual grounding
    • +
    +
    +

    Visual grounding involves linking textual descriptions to specific regions within an image. This enables the model to understand where particular objects or entities described in a prompt are in the image. For example, if the prompt is “a red car,” the model will identify and highlight the region where the red car is found in the image. Visual grounding is helpful for applications where precise alignment between text and visual content is needed, such as human-computer interaction, image annotation, and interactive AI systems.

    +
    +

    In the tutorial, we will walk through:

    +
      +
    • Setting up Florence-2 on the Raspberry Pi
    • +
    • Running inference tasks such as object detection and captioning
    • +
    • Optimizing the model to get the best performance from the edge device
    • +
    • Exploring practical, real-world applications with fine-tuning.
    • +
    +
    +
    +

    Florence-2 Model Architecture

    +

    Florence-2 utilizes a unified, prompt-based representation to handle various vision-language tasks. The model architecture consists of two main components: an image encoder and a multi-modal transformer encoder-decoder.

    +

    +
      +
    • Image Encoder: The image encoder is based on the DaViT (Dual Attention Vision Transformers) architecture. It converts input images into a series of visual token embeddings. These embeddings serve as the foundational representations of the visual content, capturing both spatial and contextual information about the image.

    • +
    • Multi-Modal Transformer Encoder-Decoder: Florence-2’s core is the multi-modal transformer encoder-decoder, which combines visual token embeddings from the image encoder with textual embeddings generated by a BERT-like model. This combination allows the model to simultaneously process visual and textual inputs, enabling a unified approach to tasks such as image captioning, object detection, and segmentation.

    • +
    +

    The model’s training on the extensive FLD-5B dataset ensures it can effectively handle diverse vision tasks without requiring task-specific modifications. Florence-2 uses textual prompts to activate specific tasks, making it highly flexible and capable of zero-shot generalization. For tasks like object detection or visual grounding, the model incorporates additional location tokens to represent regions within the image, ensuring a precise understanding of spatial relationships.

    +
    +

    Florence-2’s compact architecture and innovative training approach allow it to perform computer vision tasks accurately, even on resource-constrained devices like the Raspberry Pi.

    +
    +
    +
    +
    +

    Technical Overview

    +

    Florence-2 introduces several innovative features that set it apart:

    +
    +

    Architecture

    +

    +
      +
    • Lightweight Design: Two variants available +
        +
      • Florence-2-Base: 232 million parameters
      • +
      • Florence-2-Large: 771 million parameters
      • +
    • +
    • Unified Representation: Handles multiple vision tasks through a single architecture
    • +
    • DaViT Vision Encoder: Converts images into visual token embeddings
    • +
    • Transformer-based Multi-modal Encoder-Decoder: Processes combined visual and text embeddings
    • +
    +
    +
    +

    Training Dataset (FLD-5B)

    +

    +
      +
    • 126 million unique images
    • +
    • 5.4 billion comprehensive annotations, including: +
        +
      • 500M text annotations
      • +
      • 1.3B region-text annotations
      • +
      • 3.6B text-phrase-region annotations
      • +
    • +
    • Automated annotation pipeline using specialist models
    • +
    • Iterative refinement process for high-quality labels
    • +
    +
    +
    +

    Key Capabilities

    +

    Florence-2 excels in multiple vision tasks:

    +
    +

    Zero-shot Performance

    +
      +
    • Image Captioning: Achieves 135.6 CIDEr score on COCO
    • +
    • Visual Grounding: 84.4% recall@1 on Flickr30k
    • +
    • Object Detection: 37.5 mAP on COCO val2017
    • +
    • Referring Expression: 67.0% accuracy on RefCOCO
    • +
    +
    +
    +

    Fine-tuned Performance

    +
      +
    • Competitive with specialist models despite the smaller size
    • +
    • Outperforms larger models in specific benchmarks
    • +
    • Efficient adaptation to new tasks
    • +
    +
    +
    +
    +

    Practical Applications

    +

    Florence-2 can be applied across various domains:

    +
      +
    1. Content Understanding +
        +
      • Automated image captioning for accessibility
      • +
      • Visual content moderation
      • +
      • Media asset management
      • +
    +
    2. E-commerce +
        +
      • Product image analysis
      • +
      • Visual search
      • +
      • Automated product tagging
      • +
    +
    3. Healthcare +
        +
      • Medical image analysis
      • +
      • Diagnostic assistance
      • +
      • Research data processing
      • +
    +
    4. Security & Surveillance +
        +
      • Object detection and tracking
      • +
      • Anomaly detection
      • +
      • Scene understanding
      • +
    +
    +
    +
    +

    Comparing Florence-2 with other VLMs

    +

    Florence-2 stands out from other visual language models due to its impressive zero-shot capabilities. Unlike models like Google PaliGemma, which rely on extensive fine-tuning to adapt to various tasks, Florence-2 works right out of the box, as we will see in this lab. It can also compete with larger models like GPT-4V and Flamingo, which often have many more parameters yet do not always match Florence-2’s performance. For example, Florence-2 achieves better zero-shot results than Kosmos-2, even though Kosmos-2 has more than twice as many parameters.

    +

    In benchmark tests, Florence-2 has shown remarkable performance in tasks like COCO captioning and referring expression comprehension. It outperformed models like PolyFormer and UNINEXT in object detection and segmentation tasks on the COCO dataset. It is a highly competitive choice for real-world applications where both performance and resource efficiency are crucial.

    +
    +
    +
    +

    Setup and Installation

    +

    Our choice of edge device is the Raspberry Pi 5 (Raspi-5). Its robust platform is equipped with the Broadcom BCM2712, a 2.4GHz quad-core 64-bit Arm Cortex-A76 CPU featuring Cryptographic Extension and enhanced caching capabilities. It boasts a VideoCore VII GPU, dual 4Kp60 HDMI® outputs with HDR, and a 4Kp60 HEVC decoder. Memory options include 4GB and 8GB of high-speed LPDDR4X SDRAM, with 8GB being our choice to run Florence-2. It also features expandable storage via a microSD card slot and a PCIe 2.0 interface for fast peripherals such as M.2 SSDs (Solid State Drives).

    +
    +

    For real applications, SSDs are a better option than SD cards.

    +
    +

    We suggest installing an Active Cooler, a dedicated clip-on cooling solution for the Raspberry Pi 5 (Raspi-5), for this lab. It combines an aluminum heatsink with a temperature-controlled blower fan to keep the Raspi-5 operating comfortably under heavy loads, such as running Florence-2.

    +

    +
    +

    Environment configuration

    +

    To run Microsoft Florence-2 on the Raspberry Pi 5, we’ll need a few libraries:

    +
      +
    1. Transformers: +
        +
      • Florence-2 uses the transformers library from Hugging Face for model loading and inference. This library provides the architecture for working with pre-trained vision-language models, making it easy to perform tasks like image captioning, object detection, and more. Essentially, transformers helps in interacting with the model, processing input prompts, and obtaining outputs.
      • +
    +
    2. PyTorch: +
        +
      • PyTorch is a deep learning framework that provides the infrastructure needed to run the Florence-2 model, which includes tensor operations, GPU acceleration (if a GPU is available), and model training/inference functionalities. The Florence-2 model is trained in PyTorch, and we need it to leverage its functions, layers, and computation capabilities to perform inferences on the Raspberry Pi.
      • +
    +
    3. Timm (PyTorch Image Models): +
        +
      • Florence-2 uses timm to access efficient implementations of vision models and pre-trained weights. Specifically, the timm library is utilized for the image encoder part of Florence-2, particularly for managing the DaViT architecture. It provides model definitions and optimized code for common vision tasks and allows the easy integration of different backbones that are lightweight and suitable for edge devices.
      • +
    +
    4. Einops: +
        +
      • Einops is a library for flexible and powerful tensor operations. It makes it easy to reshape and manipulate tensor dimensions, which is especially important for the multi-modal processing done in Florence-2. Vision-language models like Florence-2 often need to rearrange image data, text embeddings, and visual embeddings to align correctly for the transformer blocks, and einops simplifies these complex operations, making the code more readable and concise.
      • +
    +
    +

    In short, these libraries enable different essential components of Florence-2:

    +
      +
    • Transformers and PyTorch are needed to load the model and run the inference.
    • +
    • Timm is used to access and efficiently implement the vision encoder.
    • +
    • Einops helps reshape data, facilitating the integration of visual and text features.
    • +
    +

    All these components work together to help Florence-2 run seamlessly on our Raspberry Pi, allowing it to perform complex vision-language tasks relatively quickly.

    +

    Considering that the Raspberry Pi already has its OS installed, let’s use SSH to reach it from another computer:

    +
    ssh mjrovai@raspi-5.local
    +

    And check the IP allocated to it:

    +
    hostname -I
    +

    192.168.4.209

    +

    +

    Updating the Raspberry Pi

    +

    First, ensure your Raspberry Pi is up to date:

    +
    sudo apt update
    +sudo apt upgrade -y
    +

    Initial setup for using PIP:

    +
    sudo apt install python3-pip
    +sudo rm /usr/lib/python3.11/EXTERNALLY-MANAGED
    +pip3 install --upgrade pip
    +

    Install Dependencies

    +
    sudo apt-get install libjpeg-dev libopenblas-dev libopenmpi-dev libomp-dev
    +

    Let’s set up and activate a Virtual Environment for working with Florence-2:

    +
    python3 -m venv ~/florence
    +source ~/florence/bin/activate
    +

    Install PyTorch

    +
    pip3 install setuptools numpy Cython
    +pip3 install requests
    +pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cpu
    +pip3 install torchaudio --index-url https://download.pytorch.org/whl/cpu
    +

    Let’s verify that PyTorch is correctly installed:

    +
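    A minimal check (a simple sketch, assuming the florence virtual environment is active) is to open Python and print the installed version and a small random tensor:

    import torch

    print(torch.__version__)   # a CPU build is expected on the Raspi-5, e.g. "2.x.x+cpu"
    print(torch.rand(2, 3))    # a random tensor confirms basic tensor ops work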

    +

    Install Transformers, Timm and Einops:

    +
    pip3 install transformers
    +pip3 install timm einops
    +
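    As another quick sanity check (a convenience sketch, assuming the installs above completed without errors), we can print the installed library versions:

    import transformers, timm, einops

    print("transformers:", transformers.__version__)
    print("timm:        ", timm.__version__)
    print("einops:      ", einops.__version__)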

    Install the model:

    +
    pip3 install autodistill-florence-2
    +

    Jupyter Notebook and Python libraries

    +

    We can also install Jupyter Notebook to run and test our Python scripts.

    +
    pip3 install jupyter
    +pip3 install numpy Pillow matplotlib
    +jupyter notebook --generate-config
    +
    +
    +

    Testing the installation

    +

    Running the Jupyter Notebook on the remote computer

    +
    jupyter notebook --ip=192.168.4.209 --no-browser
    +

    Running the above command on the SSH terminal, we can see the local URL address to open the notebook:

    +

    +

    The notebook with the code used on this initial test can be found on the Lab GitHub:

    + +

    We can access it on the remote computer by entering the Raspberry Pi’s IP address and the provided token in a web browser (copy the entire URL from the terminal).

    +

    From the Home page, create a new notebook [Python 3 (ipykernel) ] and copy and paste the example code from Hugging Face Hub.

    +

    The code is designed to run Florence-2 on a given image to perform object detection. It loads the model, processes an image and a prompt, and then generates a response to identify and describe the objects in the image.

    +
      +
    • The processor helps prepare text and image inputs.
    • +
    • The model takes the processed inputs to generate a meaningful response.
    • +
    • The post-processing step refines the generated output into a more interpretable form, like bounding boxes for detected objects.
    • +
    +
    +

    This workflow leverages the versatility of Florence-2 to handle vision-language tasks and is implemented efficiently using PyTorch, Transformers, and related image-processing tools.

    +
    +
    import requests
    +from PIL import Image
    +import torch
    +from transformers import AutoProcessor, AutoModelForCausalLM 
    +
    +device = "cuda:0" if torch.cuda.is_available() else "cpu"
    +torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    +
    +model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-base",
    +                                             torch_dtype=torch_dtype, 
    +                                             trust_remote_code=True).to(device)
    +processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base", 
    +                                          trust_remote_code=True)
    +
    +prompt = "<OD>"
    +
    +url = "https://huggingface.co/datasets/huggingface/documentation-\
    +images/resolve/main/transformers/tasks/car.jpg?download=true"
    +image = Image.open(requests.get(url, stream=True).raw)
    +
    +inputs = processor(text=prompt, images=image, return_tensors="pt").to(
    +  device, torch_dtype)
    +
    +generated_ids = model.generate(
    +    input_ids=inputs["input_ids"],
    +    pixel_values=inputs["pixel_values"],
    +    max_new_tokens=1024,
    +    do_sample=False,
    +    num_beams=3,
    +)
    +generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    +
    +parsed_answer = processor.post_process_generation(generated_text, task="<OD>", 
    +                                                  image_size=(image.width, 
    +                                                              image.height))
    +
    +print(parsed_answer)
    +

    Let’s break down the provided code step by step:

    +
    +

    1. Importing Required Libraries

    +
    import requests
    +from PIL import Image
    +import torch
    +from transformers import AutoProcessor, AutoModelForCausalLM 
    +
      +
    • requests: Used to make HTTP requests. In this case, it downloads an image from a URL.
    • +
    • PIL (Pillow): Provides tools for manipulating images. Here, it’s used to open the downloaded image.
    • +
    • torch: PyTorch is imported to handle tensor operations and determine the hardware availability (CPU or GPU).
    • +
    • transformers: This module provides easy access to Florence-2 by using AutoProcessor and AutoModelForCausalLM to load pre-trained models and process inputs.
    • +
    +
    +
    +

    2. Determining the Device and Data Type

    +
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    +torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    +
      +
    • Device Setup: The code checks if a CUDA-enabled GPU is available (torch.cuda.is_available()). The device is set to “cuda:0” if a GPU is available. Otherwise, it defaults to "cpu" (our case here).
    • +
    • Data Type Setup: If a GPU is available, torch.float16 is chosen, which uses half-precision floats to speed up processing and reduce memory usage. On the CPU, it defaults to torch.float32 to maintain compatibility.
    • +
    +
    +
    +

    3. Loading the Model and Processor

    +
    model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-base", 
    +                                             torch_dtype=torch_dtype,
    +                                             trust_remote_code=True).to(device)
    +processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base",
    +                                          trust_remote_code=True)
    +
      +
    • Model Initialization: +
        +
      • AutoModelForCausalLM.from_pretrained() loads the pre-trained Florence-2 model from Microsoft’s repository on Hugging Face. The torch_dtype is set according to the available hardware (GPU/CPU), and trust_remote_code=True allows the use of any custom code that might be provided with the model.
      • +
      • .to(device) moves the model to the appropriate device (either CPU or GPU). In our case, it will be set to CPU.
      • +
    • +
    • Processor Initialization: +
        +
      • AutoProcessor.from_pretrained() loads the processor for Florence-2. The processor is responsible for transforming text and image inputs into a format the model can work with (e.g., encoding text, normalizing images, etc.).
      • +
    • +
    +
    +
    +
    +

    4. Defining the Prompt

    +
    prompt = "<OD>"
    +
      +
    • Prompt Definition: The string "<OD>" is used as a prompt. This refers to “Object Detection”, instructing the model to detect objects in the image.
    • +
    +
    +

    5. Downloading and Loading the Image

    +
    url = "https://huggingface.co/datasets/huggingface/documentation-\
    +images/resolve/main/transformers/tasks/car.jpg?download=true"
    +image = Image.open(requests.get(url, stream=True).raw)
    +
      +
    • Downloading the Image: The requests.get() function fetches the image from the specified URL. The stream=True parameter ensures the image is streamed rather than downloaded completely at once.
    • +
    • Opening the Image: Image.open() opens the image so the model can process it.
    • +
    +
    +
    +

    6. Processing Inputs

    +
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device, 
    +                                                                      torch_dtype)
    +
      +
    • Processing Input Data: The processor() function processes the text (prompt) and the image (image). The return_tensors="pt" argument converts the processed data into PyTorch tensors, which are necessary for inputting data into the model.
    • +
    • Moving Inputs to Device: .to(device, torch_dtype) moves the inputs to the correct device (CPU or GPU) and assigns the appropriate data type.
    • +
    +
    +
    +
    +

    7. Generating the Output

    +
    generated_ids = model.generate(
    +    input_ids=inputs["input_ids"],
    +    pixel_values=inputs["pixel_values"],
    +    max_new_tokens=1024,
    +    do_sample=False,
    +    num_beams=3,
    +)
    +
      +
    • Model Generation: model.generate() is used to generate the output based on the input data. +
        +
      • input_ids: Represents the tokenized form of the prompt.
      • +
      • pixel_values: Contains the processed image data.
      • +
      • max_new_tokens=1024: Specifies the maximum number of new tokens to be generated in the response. This limits the response length.
      • +
      • do_sample=False: Disables sampling; instead, the generation uses deterministic methods (beam search).
      • +
      • num_beams=3: Enables beam search with three beams, which improves output quality by considering multiple possibilities during generation.
      • +
    • +
    +
    +

    8. Decoding the Generated Text

    +
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    +
      +
    • Batch Decode: processor.batch_decode() decodes the generated IDs (tokens) into readable text. The skip_special_tokens=False parameter means that the output will include any special tokens that may be part of the response.
    • +
    +
    +
    +

    9. Post-processing the Generation

    +
    parsed_answer = processor.post_process_generation(generated_text, task="<OD>", 
    +                                                  image_size=(image.width, 
    +                                                              image.height))
    +
      +
    • Post-Processing: processor.post_process_generation() is called to process the generated text further, interpreting it based on the task ("<OD>" for object detection) and the size of the image.
    • +
    • This function extracts specific information from the generated text, such as bounding boxes for detected objects, making the output more useful for visual tasks.
    • +
    +
    +
    +

    10. Printing the Output

    +
    print(parsed_answer)
    +
      +
    • Finally, print(parsed_answer) displays the output, which could include object detection results, such as bounding box coordinates and labels for the detected objects in the image.
    • +
    +
    +
    +

    Result

    +

    Running the code, we get the following Parsed Answer:

    +
    {'<OD>': {'bboxes': [[34.23999786376953, 160.0800018310547, 597.4400024414062, 
    +371.7599792480469], [272.32000732421875, 241.67999267578125, 303.67999267578125, 
    +247.4399871826172], [454.0799865722656, 276.7200012207031, 553.9199829101562, 
    +370.79998779296875], [96.31999969482422, 280.55999755859375, 198.0800018310547, 
    +371.2799987792969]], 'labels': ['car', 'door handle', 'wheel', 'wheel']}}
    +

    First, let’s inspect the image:

    +
    import matplotlib.pyplot as plt
    +plt.figure(figsize=(8, 8))
    +plt.imshow(image)
    +plt.axis('off')
    +plt.show()
    +

    +

    By the Object Detection result, we can see that:

    +
    'labels': ['car', 'door handle', 'wheel', 'wheel']
    +

    It seems that at least a few objects were detected. We can also implement code to draw the bounding boxes around the found objects:

    +
    import matplotlib.patches as patches  # provides the Rectangle patch used below
    +
    +def plot_bbox(image, data):
    +    # Create a figure and axes
    +    fig, ax = plt.subplots()
    +
    +    # Display the image
    +    ax.imshow(image)
    +
    +    # Plot each bounding box
    +    for bbox, label in zip(data['bboxes'], data['labels']):
    +        # Unpack the bounding box coordinates
    +        x1, y1, x2, y2 = bbox
    +        # Create a Rectangle patch
    +        rect = patches.Rectangle((x1, y1), x2-x1, y2-y1, linewidth=1, 
    +                                 edgecolor='r', facecolor='none')
    +        # Add the rectangle to the Axes
    +        ax.add_patch(rect)
    +        # Annotate the label
    +        plt.text(x1, y1, label, color='white', fontsize=8, 
    +                 bbox=dict(facecolor='red', alpha=0.5))
    +
    +    # Remove the axis ticks and labels
    +    ax.axis('off')
    +
    +    # Show the plot
    +    plt.show()
    +
    +

    Box (x0, y0, x1, y1): Location tokens correspond to the top-left and bottom-right corners of a box.

    +
    +
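    For instance, this small illustrative snippet (not part of the original example) reads the first box from the parsed answer above and converts it into a width and height:

    # Illustrative only: inspect the first detected box from the earlier result.
    x0, y0, x1, y1 = parsed_answer['<OD>']['bboxes'][0]
    label = parsed_answer['<OD>']['labels'][0]
    print(f"{label}: top-left ({x0:.0f}, {y0:.0f}), "
          f"size {x1 - x0:.0f} x {y1 - y0:.0f} pixels")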

    And running

    +
    plot_bbox(image, parsed_answer['<OD>'])
    +

    We get:

    +

    +
    +
    +
    +
    +

    Florence-2 Tasks

    +

    Florence-2 is designed to perform a variety of computer vision and vision-language tasks through prompts. These tasks can be activated by providing a specific textual prompt to the model, as we saw with <OD> (Object Detection).

    +

    Florence-2’s versatility comes from combining these prompts, allowing us to guide the model’s behavior to perform specific vision tasks. Changing the prompt allows us to adapt Florence-2 to different tasks without needing task-specific modifications in the architecture. This capability directly results from Florence-2’s unified model architecture and large-scale multi-task training on the FLD-5B dataset.

    +

    Here are some of the key tasks that Florence-2 can perform, along with example prompts:

    +
    +

    1. Object Detection (OD)

    +
      +
    • Prompt: "<OD>"
    • +
    • Description: Identifies objects in an image and provides bounding boxes for each detected object. This task is helpful for applications like visual inspection, surveillance, and general object recognition.
    • +
    +
    +
    +

    2. Image Captioning

    +
      +
    • Prompt: "<CAPTION>"
    • +
    • Description: Generates a textual description for an input image. This task helps the model describe what is happening in the image, providing a human-readable caption for content understanding.
    • +
    +
    +
    +

    3. Detailed Captioning

    +
      +
    • Prompt: "<DETAILED_CAPTION>"
    • +
    • Description: Generates a more detailed caption with more nuanced information about the scene, such as the objects present and their relationships.
    • +
    +
    +
    +

    4. Visual Grounding

    +
      +
    • Prompt: "<CAPTION_TO_PHRASE_GROUNDING>"
    • +
    • Description: Links a textual description to specific regions in an image. For example, given a prompt like “a green car,” the model highlights where the green car is in the image. This is useful for human-computer interaction, where you must find specific objects based on text.
    • +
    +
    +
    +

    5. Segmentation

    +
      +
    • Prompt: "<REFERRING_EXPRESSION_SEGMENTATION>"
    • +
    • Description: Performs segmentation based on a referring expression, such as “the blue cup.” The model identifies and segments the specific region containing the object mentioned in the prompt (all related pixels).
    • +
    +
    +
    +

    6. Dense Region Captioning

    +
      +
    • Prompt: "<DENSE_REGION_CAPTION>"
    • +
    • Description: Provides captions for multiple regions within an image, offering a detailed breakdown of all visible areas, including different objects and their relationships.
    • +
    +
    +
    +

    7. OCR with Region

    +
      +
    • Prompt: "<OCR_WITH_REGION>"
    • +
    • Description: Performs Optical Character Recognition (OCR) on an image and provides bounding boxes for the detected text. This is useful for extracting and locating textual information in images, such as reading signs, labels, or other forms of text in images.
    • +
    +
    +
    +

    8. Phrase Grounding for Specific Expressions

    +
      +
    • Prompt: "<CAPTION_TO_PHRASE_GROUNDING>" along with a specific expression, such as "a wine glass".
    • +
    • Description: Locates the area in the image that corresponds to a specific textual phrase. This task allows for identifying particular objects or elements when prompted with a word or keyword.
    • +
    +
    +
    +

    9. Open Vocabulary Object Detection

    +
      +
    • Prompt: "<OPEN_VOCABULARY_DETECTION>"
    • +
    • Description: The model can detect objects without being restricted to a predefined list of classes, making it helpful in recognizing a broader range of items based on general visual understanding.
    • +
    +
    +
    +
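    For quick reference, the prompts above can be collected in a small Python list (a convenience sketch, not part of the original lab code) and reused with the helper function defined in the next section:

    # Task prompts covered in this section, for easy iteration later.
    TASK_PROMPTS = [
        "<OD>", "<CAPTION>", "<DETAILED_CAPTION>",
        "<CAPTION_TO_PHRASE_GROUNDING>", "<REFERRING_EXPRESSION_SEGMENTATION>",
        "<DENSE_REGION_CAPTION>", "<OCR_WITH_REGION>", "<OPEN_VOCABULARY_DETECTION>",
    ]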
    +

    Exploring computer vision and vision-language tasks

    +

    For exploration, all the code can be found on the Lab GitHub:

    + +

    Let’s use a couple of images created by DALL·E and upload them to the Raspi-5 (FileZilla can be used for that). The images will be saved in a subfolder named images:

    +
    dogs_cats = Image.open('./images/dogs-cats.jpg')
    +table = Image.open('./images/table.jpg')
    +

    +

    Let’s create a function to facilitate our exploration and to keep track of the latency of the model for different tasks:

    +
    import time  # used to measure each task's latency
    +
    +def run_example(task_prompt, text_input=None, image=None):
    +    start_time = time.perf_counter()  # Start timing
    +    if text_input is None:
    +        prompt = task_prompt
    +    else:
    +        prompt = task_prompt + text_input
    +    inputs = processor(text=prompt, images=image, 
    +                       return_tensors="pt").to(device)
    +    generated_ids = model.generate(
    +      input_ids=inputs["input_ids"],
    +      pixel_values=inputs["pixel_values"],
    +      max_new_tokens=1024,
    +      early_stopping=False,
    +      do_sample=False,
    +      num_beams=3,
    +    )
    +    generated_text = processor.batch_decode(generated_ids, 
    +                                            skip_special_tokens=False)[0]
    +    parsed_answer = processor.post_process_generation(
    +        generated_text,
    +        task=task_prompt,
    +        image_size=(image.width, image.height)
    +    )
    +    
    +    end_time = time.perf_counter()  # End timing
    +    elapsed_time = end_time - start_time  # Calculate elapsed time
    +    print(f"\n[INFO] ==> Florence-2-base ({task_prompt}), "
    +          f"took {elapsed_time:.1f} seconds to execute.\n")
    +    
    +    return parsed_answer
    +
    +

    Caption

    +

    1. Dogs and Cats

    +
    run_example(task_prompt='<CAPTION>',image=dogs_cats)
    +
    [INFO] ==> Florence-2-base (<CAPTION>), took 16.1 seconds to execute.
    +
    +{'<CAPTION>': 'A group of dogs and cats sitting in a garden.'}
    +

    2. Table

    +
    run_example(task_prompt='<CAPTION>',image=table)
    +
    [INFO] ==> Florence-2-base (<CAPTION>), took 16.5 seconds to execute.
    +
    +{'<CAPTION>': 'A wooden table topped with a plate of fruit and a glass of wine.'}
    +
    +
    +

    DETAILED_CAPTION

    +

    1. Dogs and Cats

    +
    run_example(task_prompt='<DETAILED_CAPTION>',image=dogs_cats)
    +
    [INFO] ==> Florence-2-base (<DETAILED_CAPTION>), took 25.5 seconds to execute.
    +
    +{'<DETAILED_CAPTION>': 'The image shows a group of cats and dogs sitting on top of a
    +lush green field, surrounded by plants with flowers, trees, and a house in the 
    +background. The sky is visible above them, creating a peaceful atmosphere.'}
    +

    2. Table

    +
    run_example(task_prompt='<DETAILED_CAPTION>',image=table)
    +
    [INFO] ==> Florence-2-base (<DETAILED_CAPTION>), took 26.8 seconds to execute.
    +
    +{'<DETAILED_CAPTION>': 'The image shows a wooden table with a bottle of wine and a 
    +glass of wine on it, surrounded by a variety of fruits such as apples, oranges, and 
    +grapes. In the background, there are chairs, plants, trees, and a house, all slightly 
    +blurred.'}
    +
    +
    +

    MORE_DETAILED_CAPTION

    +

    1. Dogs and Cats

    +
    run_example(task_prompt='<MORE_DETAILED_CAPTION>',image=dogs_cats)
    +
    [INFO] ==> Florence-2-base (<MORE_DETAILED_CAPTION>), took 49.8 seconds to execute.
    +
    +{'<MORE_DETAILED_CAPTION>': 'The image shows a group of four cats and a dog in a garden. 
    +The garden is filled with colorful flowers and plants, and there is a pathway leading up 
    +to a house in the background. The main focus of the image is a large German Shepherd dog 
    +standing on the left side of the garden, with its tongue hanging out and its mouth open, 
    +as if it is panting or panting. On the right side, there are two smaller cats, one orange 
    +and one gray, sitting on the grass. In the background, there is another golden retriever 
    +dog sitting and looking at the camera. The sky is blue and the sun is shining, creating a 
    +warm and inviting atmosphere.'}
    +

    2. Table

    +
    run_example(task_prompt='<MORE_DETAILED_CAPTION>',image=table)
    +
    [INFO] ==> Florence-2-base (<MORE_DETAILED_CAPTION>), took 32.4 seconds to execute.
    +
    +{'<MORE_DETAILED_CAPTION>': 'The image shows a wooden table with a wooden tray on it. On 
    +the tray, there are various fruits such as grapes, oranges, apples, and grapes. There is 
    +also a bottle of red wine on the table. The background shows a garden with trees and a 
    +house. The overall mood of the image is peaceful and serene.'}
    +
    +

    We can note that the more detailed the caption task, the longer the latency and the greater the possibility of mistakes (like “The image shows a group of four cats and a dog in a garden”, instead of two dogs and three cats).

    +
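    To reproduce this comparison, a short loop over the three caption prompts (using the run_example helper defined above) is enough; the printed timings will vary with CPU load and temperature:

    for task in ['<CAPTION>', '<DETAILED_CAPTION>', '<MORE_DETAILED_CAPTION>']:
        print(run_example(task_prompt=task, image=dogs_cats))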
    +
    +
    +

    OD - Object Detection

    +

    We can run the same previous function for object detection using the prompt <OD>.

    +
    task_prompt = '<OD>'
    +results = run_example(task_prompt,image=dogs_cats)
    +print(results)
    +

    Let’s see the result:

    +
    [INFO] ==> Florence-2-base (<OD>), took 20.9 seconds to execute.
    +
    +{'<OD>': {'bboxes': [[737.7920532226562, 571.904052734375, 1022.4640502929688, 
    +980.4800415039062], [0.5120000243186951, 593.4080200195312, 211.4560089111328, 
    +991.7440185546875], [445.9520263671875, 721.4080200195312, 680.4480590820312, 
    +850.4320678710938], [39.42400360107422, 91.64800262451172, 491.0080261230469, 
    +933.3760375976562], [570.8800048828125, 184.83201599121094, 974.3360595703125, 
    +782.8480224609375]], 'labels': ['cat', 'cat', 'cat', 'dog', 'dog']}}
    +

    From the labels alone, ['cat', 'cat', 'cat', 'dog', 'dog'], we can see that the main objects in the image were captured. Let’s apply the function used before to draw the bounding boxes:

    +
    plot_bbox(dogs_cats, results['<OD>'])
    +

    +

    Let’s also do it with the Table image:

    +
    task_prompt = '<OD>'
    +results = run_example(task_prompt,image=table)
    +plot_bbox(table, results['<OD>'])
    +
    [INFO] ==> Florence-2-base (<OD>), took 40.8 seconds to execute.
    +

    +
    +
    +

    DENSE_REGION_CAPTION

    +

It is possible to combine classic object detection with captioning of specific sub-regions of the image:

    +
    task_prompt = '<DENSE_REGION_CAPTION>'
    +
    +results = run_example(task_prompt,image=dogs_cats)
    +plot_bbox(dogs_cats, results['<DENSE_REGION_CAPTION>'])
    +
    +results = run_example(task_prompt,image=table)
    +plot_bbox(table, results['<DENSE_REGION_CAPTION>'])
    +

    +
    +
    +

    CAPTION_TO_PHRASE_GROUNDING

    +

With this task, we can provide a caption, such as “a wine glass,” “a wine bottle,” or “a half orange,” and Florence-2 will localize that object in the image:

    +
    task_prompt = '<CAPTION_TO_PHRASE_GROUNDING>'
    +
    +results = run_example(task_prompt, text_input="a wine bottle",image=table)
    +plot_bbox(table, results['<CAPTION_TO_PHRASE_GROUNDING>'])
    +
    +results = run_example(task_prompt, text_input="a wine glass",image=table)
    +plot_bbox(table, results['<CAPTION_TO_PHRASE_GROUNDING>'])
    +
    +results = run_example(task_prompt, text_input="a half orange",image=table)
    +plot_bbox(table, results['<CAPTION_TO_PHRASE_GROUNDING>'])
    +

    +
    [INFO] ==> Florence-2-base (<CAPTION_TO_PHRASE_GROUNDING>), took 15.7 seconds to execute
    +each task.
    +
    +
    +

    Cascade Tasks

    +

We can also feed the image caption back in as the input text to push Florence-2 to find more objects:

    +
    task_prompt = '<CAPTION>'
    +results = run_example(task_prompt,image=dogs_cats)
    +text_input = results[task_prompt]
    +task_prompt = '<CAPTION_TO_PHRASE_GROUNDING>'
    +results = run_example(task_prompt, text_input,image=dogs_cats)
    +plot_bbox(dogs_cats, results['<CAPTION_TO_PHRASE_GROUNDING>'])
    +

Changing the caption task among <CAPTION>, <DETAILED_CAPTION>, and <MORE_DETAILED_CAPTION>, we will get progressively more objects detected in the image, as sketched below.
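As a minimal sketch of this cascade (reusing the run_example() and plot_bbox() helpers defined earlier in this lab; the loop and the printed object count are our own additions, not part of the original notebook):

for caption_task in ['<CAPTION>', '<DETAILED_CAPTION>', '<MORE_DETAILED_CAPTION>']:
    # Generate a caption of increasing detail
    caption = run_example(caption_task, image=dogs_cats)[caption_task]
    # Ground every phrase of that caption back onto the image
    results = run_example('<CAPTION_TO_PHRASE_GROUNDING>',
                          text_input=caption, image=dogs_cats)
    grounding = results['<CAPTION_TO_PHRASE_GROUNDING>']
    print(f"{caption_task}: {len(grounding['bboxes'])} objects grounded")
    plot_bbox(dogs_cats, grounding)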

    +

    +
    +
    +

    OPEN_VOCABULARY_DETECTION

    +

    <OPEN_VOCABULARY_DETECTION> allows Florence-2 to detect recognizable objects in an image without relying on a predefined list of categories, making it a versatile tool for identifying various items that may not have been explicitly labeled during training. Unlike <CAPTION_TO_PHRASE_GROUNDING>, which requires a specific text phrase to locate and highlight a particular object in an image, <OPEN_VOCABULARY_DETECTION> performs a broad scan to find and classify all objects present.

    +

This makes <OPEN_VOCABULARY_DETECTION> particularly useful for applications where you need a comprehensive overview of everything in an image without prior knowledge of what to expect. We can also provide a text describing specific objects that were not previously detected, and the model will try to locate them. For example:

    +
    task_prompt = '<OPEN_VOCABULARY_DETECTION>'
    +text = ["a house", "a tree", "a standing cat at the left", 
    +        "a sleeping cat on the ground", "a standing cat at the right", 
    +        "a yellow cat"]
    +for txt in text:
    +    results = run_example(task_prompt, text_input=txt,image=dogs_cats)
    +    bbox_results  = convert_to_od_format(results['<OPEN_VOCABULARY_DETECTION>'])
    +    plot_bbox(dogs_cats, bbox_results)
    +

    +
    [INFO] ==> Florence-2-base (<OPEN_VOCABULARY_DETECTION>), took 15.1 seconds to execute 
    +each task.
    +
    +

Note: Pushing Florence-2 to find objects that it did not previously detect can lead to mistakes (see the examples in the Notebook).

    +
    +
    +
    +

    Referring expression segmentation

    +

We can also segment a specific object in the image by giving its description (caption), such as “a wine bottle” in the table image or “a German Shepherd” in the dogs_cats image.

    +

The referring expression segmentation result format is {'<REFERRING_EXPRESSION_SEGMENTATION>': {'polygons': [[[polygon]], ...], 'labels': ['', '', ...]}}: one object is represented by a list of polygons, and each polygon is [x1, y1, x2, y2, ..., xn, yn].

    +
    +

    Polygon (x1, y1, …, xn, yn): Location tokens represent the vertices of a polygon in clockwise order.

    +
    +

    So, let’s first create a function to plot the segmentation:

    +
    from PIL import Image, ImageDraw, ImageFont
    +import copy
    +import random
    +import numpy as np
    +colormap = ['blue','orange','green','purple','brown','pink','gray','olive',
    +    'cyan','red','lime','indigo','violet','aqua','magenta','coral','gold',
    +    'tan','skyblue']
    +
    +def draw_polygons(image, prediction, fill_mask=False):
    +    """
    +    Draws segmentation masks with polygons on an image.
    +
+    Parameters:
+    - image: PIL Image object on which to draw.
+    - prediction: Dictionary containing 'polygons' and 'labels' keys.
+                  'polygons' is a list of lists, each containing the vertices
+                  of a polygon.
+                  'labels' is a list of labels corresponding to each polygon.
+    - fill_mask: Boolean indicating whether to fill the polygons with color.
+    """
+    # Draw directly on the provided PIL image
+    draw = ImageDraw.Draw(image)
    +
    +
    +    # Set up scale factor if needed (use 1 if not scaling)
    +    scale = 1
    +
    +    # Iterate over polygons and labels
    +    for polygons, label in zip(prediction['polygons'], prediction['labels']):
    +        color = random.choice(colormap)
    +        fill_color = random.choice(colormap) if fill_mask else None
    +
    +        for _polygon in polygons:
    +            _polygon = np.array(_polygon).reshape(-1, 2)
    +            if len(_polygon) < 3:
    +                print('Invalid polygon:', _polygon)
    +                continue
    +
    +            _polygon = (_polygon * scale).reshape(-1).tolist()
    +
    +            # Draw the polygon
    +            if fill_mask:
    +                draw.polygon(_polygon, outline=color, fill=fill_color)
    +            else:
    +                draw.polygon(_polygon, outline=color)
    +
    +            # Draw the label text
    +            draw.text((_polygon[0] + 8, _polygon[1] + 2), label, fill=color)
    +    
    +    # Save or display the image
    +    #image.show()  # Display the image
    +    display(image)
    +

    Now we can run the functions:

    +
    task_prompt = '<REFERRING_EXPRESSION_SEGMENTATION>'
    +
    +results = run_example(task_prompt, text_input="a wine bottle",image=table)
    +output_image = copy.deepcopy(table)
    +draw_polygons(output_image, 
    +              results['<REFERRING_EXPRESSION_SEGMENTATION>'], 
    +              fill_mask=True)
    +
    +results = run_example(task_prompt, text_input="a german sheppard",image=dogs_cats)
    +output_image = copy.deepcopy(dogs_cats)
    +draw_polygons(output_image, 
    +              results['<REFERRING_EXPRESSION_SEGMENTATION>'], 
    +              fill_mask=True)
    +

    +
    [INFO] ==> Florence-2-base (<REFERRING_EXPRESSION_SEGMENTATION>), took 207.0 seconds to 
    +execute each task.
    +
    +
    +

    Region to Segmentation

    +

With this task, it is also possible to give the coordinates of an object in the image to segment it. The input format is '<loc_x1><loc_y1><loc_x2><loc_y2>', where [x1, y1, x2, y2] are the box coordinates quantized to the range [0, 999].

    +

    For example, when running the code:

    +
    task_prompt = '<CAPTION_TO_PHRASE_GROUNDING>'
    +results = run_example(task_prompt, text_input="a half orange",image=table)
    +results
    +

    The results were:

    +
    {'<CAPTION_TO_PHRASE_GROUNDING>': {'bboxes': [[343.552001953125,
    +    689.6640625,
    +    530.9440307617188,
    +    873.9840698242188]],
    +  'labels': ['a half']}}
    +
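The grounding task returns the box in pixel coordinates, while the location tokens expect values quantized to [0, 999]. Below is a minimal sketch of that conversion; the helper name to_loc_tokens is our own, and we assume the quantization is simply the pixel value scaled by the image size (when the image is roughly 1000 pixels on a side, this reduces to the plain rounding used in the next cell):

def to_loc_tokens(bbox, image_width, image_height):
    # Quantize a pixel-space [x1, y1, x2, y2] box into Florence-2 location tokens
    x1, y1, x2, y2 = bbox
    qx1 = int(round(x1 / image_width * 999))
    qy1 = int(round(y1 / image_height * 999))
    qx2 = int(round(x2 / image_width * 999))
    qy2 = int(round(y2 / image_height * 999))
    return f"<loc_{qx1}><loc_{qy1}><loc_{qx2}><loc_{qy2}>"

# Hypothetical example for a 1000 x 1000 image:
# prints '<loc_343><loc_689><loc_530><loc_873>'
print(to_loc_tokens([343.6, 689.7, 530.9, 874.0], 1000, 1000))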

Using the rounded bounding-box coordinates:

    +
    task_prompt = '<REGION_TO_SEGMENTATION>'
    +results = run_example(task_prompt, 
    +                      text_input="<loc_343><loc_690><loc_531><loc_874>",
    +                      image=table)
    +output_image = copy.deepcopy(table)
    +draw_polygons(output_image, results['<REGION_TO_SEGMENTATION>'], fill_mask=True)  
    +

We got the segmentation of the object at those coordinates (latency: 83 seconds):

    +

    +
    +
    +

    Region to Texts

    +

We can also give a region (its coordinates) and ask for a category or a caption:

    +
    task_prompt = '<REGION_TO_CATEGORY>'
+results = run_example(task_prompt,
+                      text_input="<loc_343><loc_690><loc_531><loc_874>",
+                      image=table)
    +results
    +
    [INFO] ==> Florence-2-base (<REGION_TO_CATEGORY>), took 14.3 seconds to execute.
    +
    {'<REGION_TO_CATEGORY>': 'orange<loc_343><loc_690><loc_531><loc_874>'}
    +

    The model identified an orange in that region. Let’s ask for a description:

    +
    task_prompt = '<REGION_TO_DESCRIPTION>'
+results = run_example(task_prompt,
+                      text_input="<loc_343><loc_690><loc_531><loc_874>",
+                      image=table)
    +results
    +
[INFO] ==> Florence-2-base (<REGION_TO_DESCRIPTION>), took 14.6 seconds to execute.
+
+{'<REGION_TO_DESCRIPTION>': 'orange<loc_343><loc_690><loc_531><loc_874>'}
    +

    In this case, the description did not provide more details, but it could. Try another example.

    +
    +
    +

    OCR

    +

With Florence-2, we can perform Optical Character Recognition (OCR) on an image, getting what is written on it (task_prompt = '<OCR>'), and also get the bounding boxes (location) of the detected text (task_prompt = '<OCR_WITH_REGION>'). These tasks help extract and locate textual information in images, such as reading signs, labels, or other forms of text.

    +

Let’s upload a flyer from a talk in Brazil to the Raspi and test how the model works with another language (here, Portuguese):

    +
    flayer = Image.open('./images/embarcados.jpg')
    +# Display the image
    +plt.figure(figsize=(8, 8))
    +plt.imshow(flayer)
    +plt.axis('off')
    +#plt.title("Image")
    +plt.show()
    +

    +

    Let’s examine the image with '<MORE_DETAILED_CAPTION>' :

    +
    [INFO] ==> Florence-2-base (<MORE_DETAILED_CAPTION>), took 85.2 seconds to execute.
    +
    +{'<MORE_DETAILED_CAPTION>': 'The image is a promotional poster for an event called 
    +"Machine Learning Embarcados" hosted by Marcelo Roval. The poster has a black background
    +with white text. On the left side of the poster, there is a logo of a coffee cup with the
    +text "Café Com Embarcados" above it. Below the logo, it says "25 de Setembro as 17th" 
+which translates to "25th of September as 17" in English. \n\nOn the right side, there
+are two smaller text boxes with the names of the participants and their names. The first
    +text box reads "Democratizando a Inteligência Artificial para Paises em Desenvolvimento" 
    +and the second text box says "Toda quarta-feira" which is Portuguese for "Transmissão via 
    +in Portuguese".\n\nIn the center of the image, there has a photo of Marcelo, a man with a 
    +beard and glasses, smiling at the camera. He is wearing a white hard hat and a white 
    +shirt. The text boxes are in orange and yellow colors.'}
    +

The description is very accurate. Let’s now extract the most important words with the OCR task:

    +
    task_prompt = '<OCR>'
    +run_example(task_prompt,image=flayer)
    +
    [INFO] ==> Florence-2-base (<OCR>), took 37.7 seconds to execute.
    +
    {'<OCR>': 'Machine LearningCafécomEmbarcadoEmbarcadosDemocratizando a 
    +InteligênciaArtificial para Paises em25 de Setembro ás 17hDesenvolvimentoToda quarta-
    +feiraMarcelo RovalProfessor na UNIFIEI eTransmissão viainCo-Director do TinyML4D'}
    +

    Let’s locate the words in the flyer:

    +
    task_prompt = '<OCR_WITH_REGION>'
    +results = run_example(task_prompt,image=flayer)
    +

    Let’s also create a function to draw bounding boxes around the detected words:

    +
    def draw_ocr_bboxes(image, prediction):
    +    scale = 1
    +    draw = ImageDraw.Draw(image)
    +    bboxes, labels = prediction['quad_boxes'], prediction['labels']
    +    for box, label in zip(bboxes, labels):
    +        color = random.choice(colormap)
    +        new_box = (np.array(box) * scale).tolist()
    +        draw.polygon(new_box, width=3, outline=color)
+        draw.text((new_box[0]+8, new_box[1]+2),
+                  "{}".format(label),
+                  align="right",
+                  fill=color)
    +    display(image)
    +
    output_image = copy.deepcopy(flayer)
    +draw_ocr_bboxes(output_image, results['<OCR_WITH_REGION>'])
    +

    +

    We can inspect the detected words:

    +
    results['<OCR_WITH_REGION>']['labels']
    +
['</s>Machine Learning',
    + 'Café',
    + 'com',
    + 'Embarcado',
    + 'Embarcados',
    + 'Democratizando a Inteligência',
    + 'Artificial para Paises em',
    + '25 de Setembro ás 17h',
    + 'Desenvolvimento',
    + 'Toda quarta-feira',
    + 'Marcelo Roval',
    + 'Professor na UNIFIEI e',
    + 'Transmissão via',
    + 'in',
    + 'Co-Director do TinyML4D']
    +
    +
    +
    +

    Latency Summary

    +

    The latency observed for different tasks using Florence-2 on the Raspberry Pi (Raspi-5) varied depending on the complexity of the task:

    +
      +
• Image Captioning: approximately 16-17 seconds to generate a caption for an image.
• Detailed Captioning: around 25-27 seconds, since generating more nuanced scene descriptions increases latency.
• More Detailed Captioning: about 32-50 seconds, with latency growing as the description becomes more complex.
• Object Detection: approximately 20-41 seconds, depending on the image’s complexity and the number of detected objects.
• Visual Grounding: approximately 15-16 seconds to localize specific objects based on textual prompts.
• OCR (Optical Character Recognition): around 37-38 seconds to extract text from an image.
• Segmentation and Region to Segmentation: considerably longer, around 83-207 seconds, depending on the complexity and the number of regions to be segmented.
    +

    These latency times highlight the resource constraints of edge devices like the Raspberry Pi and emphasize the need to optimize the model and the environment to achieve real-time performance.

    +

    +
    +

Running complex tasks can use all 8 GB of the Raspi-5’s memory. For example, the above screenshot, taken during the Florence-2 OD task, shows all 4 CPUs at full speed and over 5 GB of memory in use. Consider increasing the SWAP memory to 2 GB.
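As a small aid while experimenting, the sketch below checks free RAM and swap before launching a heavy task. It assumes the psutil package is installed (pip install psutil), and the 1 GB threshold is an arbitrary choice:

import psutil

def check_resources(min_free_gb=1.0):
    # Report available RAM and current swap usage
    mem = psutil.virtual_memory()
    swap = psutil.swap_memory()
    free_gb = mem.available / 1e9
    print(f"RAM available: {free_gb:.1f} GB | "
          f"Swap used: {swap.used / 1e9:.1f} of {swap.total / 1e9:.1f} GB")
    if free_gb < min_free_gb:
        print("Warning: low memory; consider a lighter task or a larger swap file.")

check_resources()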

    +
    +

Checking the CPU temperature with vcgencmd measure_temp showed that the temperature can go up to +80°C.
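To log the temperature from Python while a long task runs, here is a minimal sketch (assuming the standard Raspberry Pi OS thermal zone path; vcgencmd remains the command-line alternative):

def read_cpu_temp_c(path="/sys/class/thermal/thermal_zone0/temp"):
    # The kernel exposes the SoC temperature in millidegrees Celsius
    with open(path) as f:
        return int(f.read().strip()) / 1000.0

print(f"CPU temperature: {read_cpu_temp_c():.1f} °C")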

    +
    +
    +

Fine-Tuning

    +

As explored in this lab, Florence-2 supports many tasks out of the box, including captioning, object detection, OCR, and more. However, like other pre-trained foundation models, Florence-2 may lack domain-specific knowledge; for example, it may underperform on medical or satellite imagery. In such cases, fine-tuning with a custom dataset is necessary. The Roboflow tutorial, How to Fine-tune Florence-2 for Object Detection Tasks, shows how to fine-tune Florence-2 on object detection datasets to improve model performance for a specific use case.

    +

    Based on the above tutorial, it is possible to fine-tune the Florence-2 model to detect boxes and wheels used in previous labs:

    +

    +

It is important to note that, after fine-tuning, the model can still detect classes that don’t belong to our custom dataset (like cats, dogs, grapes, etc., as seen before).

    +

The complete fine-tuning project, using a previously annotated dataset in Roboflow and executed on Colab, can be found in the notebook:

    + +

In another example, the post Fine-tuning Florence-2 - Microsoft’s Cutting-edge Vision Language Models shows how to fine-tune Florence-2 on DocVQA. The authors report that Florence-2 can perform visual question answering (VQA), but the released models don’t include VQA capability out of the box.

    +
    +
    +

    Conclusion

    +

    Florence-2 offers a versatile and powerful approach to vision-language tasks at the edge, providing performance that rivals larger, task-specific models, such as YOLO for object detection, BERT/RoBERTa for text analysis, and specialized OCR models.

    +

    Thanks to its multi-modal transformer architecture, Florence-2 is more flexible than YOLO in terms of the tasks it can handle. These include object detection, image captioning, and visual grounding.

    +

    Unlike BERT, which focuses purely on language, Florence-2 integrates vision and language, allowing it to excel in applications that require both modalities, such as image captioning and visual grounding.

    +

    Moreover, while traditional OCR models such as Tesseract and EasyOCR are designed solely for recognizing and extracting text from images, Florence-2’s OCR capabilities are part of a broader framework that includes contextual understanding and visual-text alignment. This makes it particularly useful for scenarios that require both reading text and interpreting its context within images.

    +

    Overall, Florence-2 stands out for its ability to seamlessly integrate various vision-language tasks into a unified model that is efficient enough to run on edge devices like the Raspberry Pi. This makes it a compelling choice for developers and researchers exploring AI applications at the edge.

    +
    +

    Key Advantages of Florence-2

    +
      +
1. Unified Architecture
  • Single model handles multiple vision tasks vs. specialized models (YOLO, BERT, Tesseract)
  • Eliminates the need for multiple model deployments and integrations
  • Consistent API and interface across tasks
2. Performance Comparison
  • Object Detection: Comparable to YOLOv8 (~37.5 mAP on COCO vs. YOLOv8’s ~39.7 mAP) despite being general-purpose
  • Text Recognition: Handles multiple languages effectively like specialized OCR models (Tesseract, EasyOCR)
  • Language Understanding: Integrates BERT-like capabilities for text processing while adding visual context
3. Resource Efficiency
  • The Base model (232M parameters) achieves strong results despite smaller size
  • Runs effectively on edge devices (Raspberry Pi)
  • Single model deployment vs. multiple specialized models
    +
    +
    +

    Trade-offs

    +
      +
1. Performance vs. Specialized Models
  • YOLO series may offer faster inference for pure object detection
  • Specialized OCR models might handle complex document layouts better
  • BERT/RoBERTa provide deeper language understanding for text-only tasks
2. Resource Requirements
  • Higher latency on edge devices (15-200 s, depending on the task)
  • Requires careful memory management on the Raspberry Pi
  • May need optimization for real-time applications
3. Deployment Considerations
  • Initial setup is more complex than for single-purpose models
  • Requires understanding of multiple task types and prompts
  • Learning curve for optimal prompt engineering
    +
    +
    +

    Best Use Cases

    +
      +
1. Resource-Constrained Environments
  • Edge devices requiring multiple vision capabilities
  • Systems with limited storage/deployment capacity
  • Applications needing flexible vision processing
2. Multi-modal Applications
  • Content moderation systems
  • Accessibility tools
  • Document analysis workflows
3. Rapid Prototyping
  • Quick deployment of vision capabilities
  • Testing multiple vision tasks without separate models
  • Proof-of-concept development
    +
    +
    +
    +

    Future Implications

    +

    Florence-2 represents a shift toward unified vision models that could eventually replace task-specific architectures in many applications. While specialized models maintain advantages in specific scenarios, the convenience and efficiency of unified models like Florence-2 make them increasingly attractive for real-world deployments.

    +

    The lab demonstrates Florence-2’s viability on edge devices, suggesting future IoT, mobile computing, and embedded systems applications where deploying multiple specialized models would be impractical.

    +
    +
    +

    Resources


Shared Labs