From f5aa988bafcf05e614a20ae414a77615be667688 Mon Sep 17 00:00:00 2001
From: Aleksandr Karpinskii <homm86@gmail.com>
Date: Mon, 23 Sep 2024 21:06:38 +0400
Subject: [PATCH] SIMD ColorLUT. Fix color overflow

# Conflicts:
#	src/PIL/_version.py
---
 CHANGES.SIMD.rst          | 6 +++++-
 src/PIL/_version.py       | 2 +-
 src/_imaging.c            | 6 ++++--
 src/libImaging/ColorLUT.c | 8 ++++----
 4 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/CHANGES.SIMD.rst b/CHANGES.SIMD.rst
index 033b16dc652..5c12e770d9b 100644
--- a/CHANGES.SIMD.rst
+++ b/CHANGES.SIMD.rst
@@ -1,13 +1,17 @@
 Changelog (Pillow-SIMD)
 =======================
 
+9.5.0.post2 & 9.4.0.post2 & 9.3.0.post2 & 9.2.0.post2 & 9.1.1.post2 & 9.0.0.post3
+---------------------------------------------------------------------------------
+
+- Fixed color overflow in LUT processing
+
 9.0.0.post1
 -----------
 
 - Fixed possible overflow in LUT processing
 - Restored compatibility with Visual C Compiler
 
-
 7.0.0.post4
 -----------
 
diff --git a/src/PIL/_version.py b/src/PIL/_version.py
index bf696c6eb85..e39026f3b2a 100644
--- a/src/PIL/_version.py
+++ b/src/PIL/_version.py
@@ -1,2 +1,2 @@
 # Master version for Pillow
-__version__ = "9.4.0.post1"
+__version__ = "9.4.0.post2"
diff --git a/src/_imaging.c b/src/_imaging.c
index 6f90505ed7e..4cd396409ee 100644
--- a/src/_imaging.c
+++ b/src/_imaging.c
@@ -723,6 +723,7 @@ _prepare_lut_table(PyObject *table, Py_ssize_t table_size) {
 
 /* NOTE: This value should be the same as in ColorLUT.c */
 #define PRECISION_BITS (16 - 8 - 2)
+#define PRECISION_ROUNDING (1 << (PRECISION_BITS - 1))
 
     const char *wrong_size =
         ("The table should have table_channels * "
@@ -787,8 +788,8 @@ _prepare_lut_table(PyObject *table, Py_ssize_t table_size) {
                 break;
         }
         /* Max value for INT16 */
-        if (item >= (0x7fff - 0.5) / (255 << PRECISION_BITS)) {
-            prepared[i] = 0x7fff;
+        if (item >= (0x7fff - 0.5 - PRECISION_ROUNDING) / (255 << PRECISION_BITS)) {
+            prepared[i] = 0x7fff - PRECISION_ROUNDING;
             continue;
         }
         /* Min value for INT16 */
@@ -804,6 +805,7 @@ _prepare_lut_table(PyObject *table, Py_ssize_t table_size) {
     }
 
 #undef PRECISION_BITS
+#undef PRECISION_ROUNDING
     if (free_table_data) {
         free(table_data);
     }
diff --git a/src/libImaging/ColorLUT.c b/src/libImaging/ColorLUT.c
index a9887dc58a2..167be0b0742 100644
--- a/src/libImaging/ColorLUT.c
+++ b/src/libImaging/ColorLUT.c
@@ -233,7 +233,7 @@ ImagingColorLUT3D_linear(
             __m128i source = _mm_loadu_si128((__m128i *) &rowIn[x]);
             // scale up to 16 bits, but scale * 255 * 256 up to 31 bits
             // bi, gi and ri - 6 bits index
-            // rs, rs and rs - 9 bits shift
+            // bs, gs and rs - 9 bits shift
             // 00 bi3.bs3 gi3.gs3 ri3.rs3 00 bi2.bs2 gi2.gs2 ri2.rs2
             // 00 bi1.bs1 gi1.gs1 ri1.rs1 00 bi0.bs0 gi0.gs0 ri0.rs0
             __m256i index = _mm256_mulhi_epu16(scale256,
@@ -248,7 +248,7 @@ ImagingColorLUT3D_linear(
                 __m128i next_source = _mm_loadu_si128((__m128i *) &rowIn[x + 4]);
                 // scale up to 16 bits, but scale * 255 * 256 up to 31 bits
                 // bi, gi and ri - 6 bits index
-                // rs, rs and rs - 9 bits shift
+                // bs, gs and rs - 9 bits shift
                 // 00 bi3.bs3 gi3.gs3 ri3.rs3 00 bi2.bs2 gi2.gs2 ri2.rs2
                 // 00 bi1.bs1 gi1.gs1 ri1.rs1 00 bi0.bs0 gi0.gs0 ri0.rs0
                 __m256i next_index = _mm256_mulhi_epu16(scale256,
@@ -332,7 +332,7 @@ ImagingColorLUT3D_linear(
             __m128i source = _mm_loadl_epi64((__m128i *) &rowIn[x]);
             // scale up to 16 bits, but scale * 255 * 256 up to 31 bits
             // bi, gi and ri - 6 bits index
-            // rs, rs and rs - 9 bits shift
+            // bs, gs and rs - 9 bits shift
             // 00 bi1.bs1 gi1.gs1 ri1.rs1 00 bi0.bs0 gi0.gs0 ri0.rs0
             __m128i index = _mm_mulhi_epu16(scale,
                 _mm_unpacklo_epi8(_mm_setzero_si128(), source));
@@ -402,7 +402,7 @@ ImagingColorLUT3D_linear(
             __m128i source = _mm_cvtsi32_si128(rowIn[x]);
             // scale up to 16 bits, but scale * 255 * 256 up to 31 bits
             // bi, gi and ri - 6 bits index
-            // rs, rs and rs - 9 bits shift
+            // bs, gs and rs - 9 bits shift
             // 00 00 00 00 00 bi.bs gi.gs ri.rs
             __m128i index = _mm_mulhi_epu16(scale,
                 _mm_unpacklo_epi8(_mm_setzero_si128(), source));