Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow using SIMD intrinsics for SSE2, SSE4, AVX2 and NEON code #8209

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
13 changes: 11 additions & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,15 +52,19 @@ jobs:
include:
- { python-version: "3.11", PYTHONOPTIMIZE: 1, REVERSE: "--reverse" }
- { python-version: "3.10", PYTHONOPTIMIZE: 2 }
# SIMD-accelerated builds for x86
- { os: "ubuntu-latest", python-version: "3.9", acceleration: "sse4"}
- { os: "ubuntu-latest", python-version: "3.12", acceleration: "avx2"}
# Free-threaded
- { os: "ubuntu-latest", python-version: "3.13-dev", disable-gil: true }
# M1 only available for 3.10+
- { os: "macos-13", python-version: "3.9" }
- { os: "macos-13", python-version: "3.9", acceleration: "avx2"}
exclude:
- { os: "macos-14", python-version: "3.9" }

runs-on: ${{ matrix.os }}
name: ${{ matrix.os }} Python ${{ matrix.python-version }} ${{ matrix.disable-gil && 'free-threaded' || '' }}
name: ${{ matrix.os }} Python ${{ matrix.python-version }} ${{ matrix.acceleration }} ${{ matrix.disable-gil && 'free-threaded' || '' }}

steps:
- uses: actions/checkout@v4
Expand Down Expand Up @@ -108,7 +112,7 @@ jobs:
GHA_LIBIMAGEQUANT_CACHE_HIT: ${{ steps.cache-libimagequant.outputs.cache-hit }}

- name: Install macOS dependencies
if: startsWith(matrix.os, 'macOS')
if: startsWith(matrix.os, 'macos')
run: |
.github/workflows/macos-install.sh
env:
Expand All @@ -118,6 +122,11 @@ jobs:
if: "matrix.os == 'ubuntu-latest' && matrix.python-version == '3.12'"
run: echo "::add-matcher::.github/problem-matchers/gcc.json"

- name: Set compiler options for optimization
if: ${{ matrix.acceleration }}
run: |
echo "CC=cc -m${{ matrix.acceleration }}" >> $GITHUB_ENV

- name: Build
run: |
.ci/build.sh
Expand Down
4 changes: 3 additions & 1 deletion Tests/test_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@ def test(name: str, function: Callable[[str], str | None]) -> None:
assert version is None
else:
assert function(name) == version
if name != "PIL":
if name == "acceleration":
assert version in ("avx2", "sse4", "sse2", "neon", None)
elif name != "PIL":
if name == "zlib" and version is not None:
version = re.sub(".zlib-ng$", "", version)
elif name == "libtiff" and version is not None:
Expand Down
4 changes: 3 additions & 1 deletion src/PIL/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ def get_supported_codecs() -> list[str]:
"libjpeg_turbo": ("PIL._imaging", "HAVE_LIBJPEGTURBO", "libjpeg_turbo_version"),
"libimagequant": ("PIL._imaging", "HAVE_LIBIMAGEQUANT", "imagequant_version"),
"xcb": ("PIL._imaging", "HAVE_XCB", None),
"acceleration": ("PIL._imaging", "HAVE_ACCELERATION", "acceleration"),
}


Expand Down Expand Up @@ -267,6 +268,7 @@ def pilinfo(out: IO[str] | None = None, supported_formats: bool = True) -> None:

for name, feature in [
("pil", "PIL CORE"),
("acceleration", "Acceleration"),
("tkinter", "TKINTER"),
("freetype2", "FREETYPE2"),
("littlecms2", "LITTLECMS2"),
Expand All @@ -291,7 +293,7 @@ def pilinfo(out: IO[str] | None = None, supported_formats: bool = True) -> None:
if v is None:
v = version(name)
if v is not None:
version_static = name in ("pil", "jpg")
version_static = name in ("pil", "jpg", "acceleration")
if name == "littlecms2":
# this check is also in src/_imagingcms.c:setup_module()
version_static = tuple(int(x) for x in v.split(".")) < (2, 7)
Expand Down
18 changes: 18 additions & 0 deletions src/_imaging.c
Original file line number Diff line number Diff line change
Expand Up @@ -4407,6 +4407,24 @@ setup_module(PyObject *m) {
Py_INCREF(have_xcb);
PyModule_AddObject(m, "HAVE_XCB", have_xcb);

PyObject *have_acceleration = Py_True;
#ifdef __AVX2__
PyModule_AddStringConstant(m, "acceleration", "avx2");
#elif defined(__SSE4__)
PyModule_AddStringConstant(m, "acceleration", "sse4");
#elif defined(__SSE2__)
PyModule_AddStringConstant(m, "acceleration", "sse2");
#elif defined(__NEON__)
PyModule_AddStringConstant(m, "acceleration", "neon");
#else
Py_INCREF(Py_None);
PyModule_AddObject(m, "acceleration", Py_None);

have_acceleration = Py_False;
#endif
Py_INCREF(have_acceleration);
PyModule_AddObject(m, "HAVE_ACCELERATION", have_acceleration);

PyObject *pillow_version = PyUnicode_FromString(version);
PyDict_SetItemString(
d, "PILLOW_VERSION", pillow_version ? pillow_version : Py_None
Expand Down
63 changes: 63 additions & 0 deletions src/libImaging/Bands.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ Imaging
ImagingGetBand(Imaging imIn, int band) {
Imaging imOut;
int x, y;
#ifdef __SSE4__
__m128i shuffle_mask;
#endif

/* Check arguments */
if (!imIn || imIn->type != IMAGING_TYPE_UINT8) {
Expand All @@ -46,14 +49,41 @@ ImagingGetBand(Imaging imIn, int band) {
return NULL;
}

#ifdef __SSE4__
shuffle_mask = _mm_set_epi8(
-1,
-1,
-1,
-1,
-1,
-1,
-1,
-1,
-1,
-1,
-1,
-1,
12 + band,
8 + band,
4 + band,
0 + band
);
#endif

/* Extract band from image */
for (y = 0; y < imIn->ysize; y++) {
UINT8 *in = (UINT8 *)imIn->image[y] + band;
UINT8 *out = imOut->image8[y];
x = 0;
for (; x < imIn->xsize - 3; x += 4) {
#ifdef __SSE4__
__m128i source = _mm_loadu_si128((__m128i *)(in - band));
*((UINT32 *)(out + x)) =
_mm_cvtsi128_si32(_mm_shuffle_epi8(source, shuffle_mask));
#else
UINT32 v = MAKE_UINT32(in[0], in[4], in[8], in[12]);
memcpy(out + x, &v, sizeof(v));
#endif
in += 16;
}
for (; x < imIn->xsize; x++) {
Expand Down Expand Up @@ -99,10 +129,20 @@ ImagingSplit(Imaging imIn, Imaging bands[4]) {
UINT8 *out1 = bands[1]->image8[y];
x = 0;
for (; x < imIn->xsize - 3; x += 4) {
#ifdef __SSE4__
__m128i source = _mm_loadu_si128((__m128i *)in);
source = _mm_shuffle_epi8(
source,
_mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0)
);
*((UINT32 *)(out0 + x)) = _mm_cvtsi128_si32(source);
*((UINT32 *)(out1 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 12));
#else
UINT32 v = MAKE_UINT32(in[0], in[4], in[8], in[12]);
memcpy(out0 + x, &v, sizeof(v));
v = MAKE_UINT32(in[0 + 3], in[4 + 3], in[8 + 3], in[12 + 3]);
memcpy(out1 + x, &v, sizeof(v));
#endif
in += 16;
}
for (; x < imIn->xsize; x++) {
Expand All @@ -119,12 +159,23 @@ ImagingSplit(Imaging imIn, Imaging bands[4]) {
UINT8 *out2 = bands[2]->image8[y];
x = 0;
for (; x < imIn->xsize - 3; x += 4) {
#ifdef __SSE4__
__m128i source = _mm_loadu_si128((__m128i *)in);
source = _mm_shuffle_epi8(
source,
_mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0)
);
*((UINT32 *)(out0 + x)) = _mm_cvtsi128_si32(source);
*((UINT32 *)(out1 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 4));
*((UINT32 *)(out2 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 8));
#else
UINT32 v = MAKE_UINT32(in[0], in[4], in[8], in[12]);
memcpy(out0 + x, &v, sizeof(v));
v = MAKE_UINT32(in[0 + 1], in[4 + 1], in[8 + 1], in[12 + 1]);
memcpy(out1 + x, &v, sizeof(v));
v = MAKE_UINT32(in[0 + 2], in[4 + 2], in[8 + 2], in[12 + 2]);
memcpy(out2 + x, &v, sizeof(v));
#endif
in += 16;
}
for (; x < imIn->xsize; x++) {
Expand All @@ -143,6 +194,17 @@ ImagingSplit(Imaging imIn, Imaging bands[4]) {
UINT8 *out3 = bands[3]->image8[y];
x = 0;
for (; x < imIn->xsize - 3; x += 4) {
#ifdef __SSE4__
__m128i source = _mm_loadu_si128((__m128i *)in);
source = _mm_shuffle_epi8(
source,
_mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0)
);
*((UINT32 *)(out0 + x)) = _mm_cvtsi128_si32(source);
*((UINT32 *)(out1 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 4));
*((UINT32 *)(out2 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 8));
*((UINT32 *)(out3 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 12));
#else
UINT32 v = MAKE_UINT32(in[0], in[4], in[8], in[12]);
memcpy(out0 + x, &v, sizeof(v));
v = MAKE_UINT32(in[0 + 1], in[4 + 1], in[8 + 1], in[12 + 1]);
Expand All @@ -151,6 +213,7 @@ ImagingSplit(Imaging imIn, Imaging bands[4]) {
memcpy(out2 + x, &v, sizeof(v));
v = MAKE_UINT32(in[0 + 3], in[4 + 3], in[8 + 3], in[12 + 3]);
memcpy(out3 + x, &v, sizeof(v));
#endif
in += 16;
}
for (; x < imIn->xsize; x++) {
Expand Down
2 changes: 2 additions & 0 deletions src/libImaging/ImPlatform.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,5 @@ typedef signed __int64 int64_t;
#ifdef __GNUC__
#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
#endif

#include "ImagingSIMD.h"
49 changes: 49 additions & 0 deletions src/libImaging/ImagingSIMD.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/* Microsoft compiler doesn't limit intrinsics for an architecture.
This macro is set only on x86 and means SSE2 and above including AVX2. */
#if defined(_M_X64) || _M_IX86_FP == 2
#define __SSE2__
/* However, Microsoft compiler set __AVX2__ if /arch:AVX2 option is set */
#ifdef __AVX2__
#define __SSE4_2__
#endif
#endif

/* For better readability */
#ifdef __SSE4_2__
#define __SSE4__
#endif
#ifdef __aarch64__
#define __NEON__
#endif

#ifdef __SSE2__
#include <mmintrin.h> // MMX
#include <xmmintrin.h> // SSE
#include <emmintrin.h> // SSE2
#endif
#ifdef __SSE4__
#include <pmmintrin.h> // SSE3
#include <tmmintrin.h> // SSSE3
#include <smmintrin.h> // SSE4.1
#include <nmmintrin.h> // SSE4.2
#endif
#ifdef __AVX2__
#include <immintrin.h> // AVX, AVX2
#endif
#ifdef __NEON__
#include <arm_neon.h> // ARM NEON
#endif

#ifdef __SSE4__
static inline __m128i
mm_cvtepu8_epi32(void *ptr) {
return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(INT32 *)ptr));
}
#endif

#ifdef __AVX2__
static inline __m256i
mm256_cvtepu8_epi32(void *ptr) {
return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)ptr));
}
#endif
Loading