Fix concatenating image patches for LLaVA Next models (#1138)
yatarkan authored Nov 4, 2024
1 parent 91aea4c commit 7954d82
Showing 1 changed file with 3 additions and 11 deletions.
14 changes: 3 additions & 11 deletions src/cpp/src/visual_language/vision_encoder.cpp
@@ -484,18 +484,10 @@ ov::Tensor get_pixel_values_llava_next(const ov::Tensor& image, const ProcessorC
     ov::Tensor concatenated_tensor(ov::element::f32, {num_patches, channels, height, width});
     float* tensor_data = concatenated_tensor.data<float>();
 
-    // Fill the tensor with the preprocessed patch data
+    // Fill the tensor with the preprocessed patches data (each patch layout is [C * H * W])
     for (size_t i = 0; i < num_patches; ++i) {
-        const auto& patch = processed_patches[i];
-        for (size_t c = 0; c < channels; ++c) {
-            for (size_t h = 0; h < height; ++h) {
-                for (size_t w = 0; w < width; ++w) {
-                    size_t tensor_index = i * channels * height * width + c * height * width + h * width + w;
-                    size_t patch_index = (h * width + w) * channels + c;
-                    tensor_data[tensor_index] = patch.buf[patch_index];
-                }
-            }
-        }
+        const auto& img = processed_patches[i];
+        std::copy(img.buf.begin(), img.buf.end(), tensor_data + i * channels * height * width);
     }
 
     return concatenated_tensor;
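For reference, below is a minimal standalone sketch of the fixed concatenation; it is not part of the commit, and Patch, the sizes, and main are illustrative stand-ins for the clip-image structs used in vision_encoder.cpp. The removed loops read each source element at (h * width + w) * channels + c, i.e. as if the patch buffer were interleaved [H * W * C]; because the preprocessed patch buffer is already planar [C * H * W] and matches the destination layout, each patch can be copied as one contiguous block with a single std::copy, avoiding the per-element index arithmetic of the old triple loop.

// Illustrative sketch (assumed types, not the repository's code): concatenating
// N planar patches into one [N, C, H, W] buffer, as the fixed code does.
#include <algorithm>
#include <cstddef>
#include <vector>

struct Patch {
    std::vector<float> buf; // planar layout: [C * H * W]
};

int main() {
    const size_t num_patches = 2, channels = 3, height = 4, width = 4;
    const size_t patch_size = channels * height * width;

    // Fake preprocessed patches, each already laid out as [C * H * W].
    std::vector<Patch> processed_patches(num_patches);
    for (size_t i = 0; i < num_patches; ++i) {
        processed_patches[i].buf.assign(patch_size, static_cast<float>(i));
    }

    // Destination buffer standing in for the ov::Tensor's data pointer.
    std::vector<float> tensor_data(num_patches * patch_size);

    // Source and destination share the same planar layout, so each patch is
    // one contiguous block and a single std::copy per patch suffices.
    for (size_t i = 0; i < num_patches; ++i) {
        const auto& img = processed_patches[i];
        std::copy(img.buf.begin(), img.buf.end(), tensor_data.data() + i * patch_size);
    }
    return 0;
}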
