Fix concatenating image patches for LLaVA Next models (#1138)
yatarkan authored Nov 4, 2024
1 parent 91aea4c commit 7954d82
Showing 1 changed file with 3 additions and 11 deletions.
14 changes: 3 additions & 11 deletions src/cpp/src/visual_language/vision_encoder.cpp
@@ -484,18 +484,10 @@ ov::Tensor get_pixel_values_llava_next(const ov::Tensor& image, const ProcessorC
     ov::Tensor concatenated_tensor(ov::element::f32, {num_patches, channels, height, width});
     float* tensor_data = concatenated_tensor.data<float>();
 
-    // Fill the tensor with the preprocessed patch data
+    // Fill the tensor with the preprocessed patches data (each patch layout is [C * H * W])
     for (size_t i = 0; i < num_patches; ++i) {
-        const auto& patch = processed_patches[i];
-        for (size_t c = 0; c < channels; ++c) {
-            for (size_t h = 0; h < height; ++h) {
-                for (size_t w = 0; w < width; ++w) {
-                    size_t tensor_index = i * channels * height * width + c * height * width + h * width + w;
-                    size_t patch_index = (h * width + w) * channels + c;
-                    tensor_data[tensor_index] = patch.buf[patch_index];
-                }
-            }
-        }
+        const auto& img = processed_patches[i];
+        std::copy(img.buf.begin(), img.buf.end(), tensor_data + i * channels * height * width);
     }
 
     return concatenated_tensor;
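For reference, below is a minimal standalone sketch of the fixed concatenation; it is not part of the commit, and Patch, the sizes, and main are illustrative stand-ins for the clip-image structs used in vision_encoder.cpp. The removed loops read each source element at (h * width + w) * channels + c, i.e. as if the patch buffer were interleaved [H * W * C]; because the preprocessed patch buffer is already planar [C * H * W] and matches the destination layout, each patch can be copied as one contiguous block with a single std::copy, avoiding the per-element index arithmetic of the old triple loop.

// Illustrative sketch (assumed types, not the repository's code): concatenating
// N planar patches into one [N, C, H, W] buffer, as the fixed code does.
#include <algorithm>
#include <cstddef>
#include <vector>

struct Patch {
    std::vector<float> buf; // planar layout: [C * H * W]
};

int main() {
    const size_t num_patches = 2, channels = 3, height = 4, width = 4;
    const size_t patch_size = channels * height * width;

    // Fake preprocessed patches, each already laid out as [C * H * W].
    std::vector<Patch> processed_patches(num_patches);
    for (size_t i = 0; i < num_patches; ++i) {
        processed_patches[i].buf.assign(patch_size, static_cast<float>(i));
    }

    // Destination buffer standing in for the ov::Tensor's data pointer.
    std::vector<float> tensor_data(num_patches * patch_size);

    // Source and destination share the same planar layout, so each patch is
    // one contiguous block and a single std::copy per patch suffices.
    for (size_t i = 0; i < num_patches; ++i) {
        const auto& img = processed_patches[i];
        std::copy(img.buf.begin(), img.buf.end(), tensor_data.data() + i * patch_size);
    }
    return 0;
}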
