From bbc7bdf23e7b85ce46381477e9240c1331d9cf43 Mon Sep 17 00:00:00 2001 From: Luca Versari Date: Sun, 10 Mar 2024 13:40:26 +0100 Subject: [PATCH] Allow BGR(A) input. --- fpnge.cc | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- fpnge.h | 7 ++++ 2 files changed, 131 insertions(+), 2 deletions(-) diff --git a/fpnge.cc b/fpnge.cc index 200d0a6..d043986 100644 --- a/fpnge.cc +++ b/fpnge.cc @@ -1461,6 +1461,126 @@ static void WriteHeader(size_t width, size_t height, size_t bytes_per_channel, } } +void CopyRow(unsigned char *dst, const unsigned char *src, size_t nb_channels, + size_t bytes_per_channel, FPNGEColorChannelOrder order, + size_t width) { + if (order == FPNGE_ORDER_RGB || nb_channels <= 2) { + memcpy(dst, src, nb_channels * bytes_per_channel * width); + return; + } + size_t x = 0; + if (nb_channels == 4 && bytes_per_channel == 1) { + for (; x + SIMD_WIDTH / 4 <= width; x += SIMD_WIDTH / 4) { + auto vec = MMSI(loadu)((MIVEC *)(src + x * 4)); + auto shuf = MM(shuffle_epi8)( + vec, BCAST128(_mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, + 13, 12, 15))); + MMSI(storeu)((MIVEC *)(dst + x * 4), shuf); + } + } else if (nb_channels == 4 && bytes_per_channel == 2) { + for (; x + SIMD_WIDTH / 8 <= width; x += SIMD_WIDTH / 8) { + auto vec = MMSI(loadu)((MIVEC *)(src + x * 8)); + auto shuf = MM(shuffle_epi8)( + vec, BCAST128(_mm_setr_epi8(4, 5, 2, 3, 0, 1, 6, 7, 12, 13, 10, 11, 8, + 9, 14, 15))); + MMSI(storeu)((MIVEC *)(dst + x * 8), shuf); + } + } else if (nb_channels == 3 && bytes_per_channel == 1) { + for (; x + 16 <= width; x += 16) { + auto vec1 = _mm_loadu_si128((__m128i *)(src + x * 3)); + auto vec2 = _mm_loadu_si128((__m128i *)(src + x * 3 + 16)); + auto vec3 = _mm_loadu_si128((__m128i *)(src + x * 3 + 32)); + auto s1a = + _mm_setr_epi8(2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, -1); + auto s1b = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, 1); + auto shuf1 = _mm_or_si128(_mm_shuffle_epi8(vec1, s1a), + _mm_shuffle_epi8(vec2, s1b)); + auto s2a = _mm_setr_epi8(-1, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1); + auto s2b = + _mm_setr_epi8(0, -1, 4, 3, 2, 7, 6, 5, 10, 9, 8, 13, 12, 11, -1, 15); + auto s2c = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, 0, -1); + auto shuf2 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(vec1, s2a), + _mm_shuffle_epi8(vec2, s2b)), + _mm_shuffle_epi8(vec3, s2c)); + auto s3b = _mm_setr_epi8(14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1); + auto s3c = + _mm_setr_epi8(-1, 3, 2, 1, 6, 5, 4, 9, 8, 7, 12, 11, 10, 15, 14, 13); + auto shuf3 = _mm_or_si128(_mm_shuffle_epi8(vec2, s3b), + _mm_shuffle_epi8(vec3, s3c)); + + _mm_storeu_si128((__m128i *)(dst + x * 3), shuf1); + _mm_storeu_si128((__m128i *)(dst + x * 3 + 16), shuf2); + _mm_storeu_si128((__m128i *)(dst + x * 3 + 32), shuf3); + } + } else if (nb_channels == 3 && bytes_per_channel == 2) { + for (; x + 8 <= width; x += 8) { + auto vec1 = _mm_loadu_si128((__m128i *)(src + x * 6)); + auto vec2 = _mm_loadu_si128((__m128i *)(src + x * 6 + 16)); + auto vec3 = _mm_loadu_si128((__m128i *)(src + x * 6 + 32)); + auto s1a = + _mm_setr_epi8(4, 5, 2, 3, 0, 1, 10, 11, 8, 9, 6, 7, -1, -1, 14, 15); + auto s1b = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 1, -1, -1); + auto shuf1 = _mm_or_si128(_mm_shuffle_epi8(vec1, s1a), + _mm_shuffle_epi8(vec2, s1b)); + auto s2a = _mm_setr_epi8(12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1); + auto s2b = + _mm_setr_epi8(-1, -1, 6, 7, 4, 5, 2, 3, 12, 13, 10, 11, 8, 9, -1, -1); + auto s2c = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, 2, 3); + auto shuf2 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(vec1, s2a), + _mm_shuffle_epi8(vec2, s2b)), + _mm_shuffle_epi8(vec3, s2c)); + auto s3b = _mm_setr_epi8(-1, -1, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1); + auto s3c = + _mm_setr_epi8(0, 1, -1, -1, 8, 9, 6, 7, 4, 5, 14, 15, 12, 13, 10, 11); + auto shuf3 = _mm_or_si128(_mm_shuffle_epi8(vec2, s3b), + _mm_shuffle_epi8(vec3, s3c)); + + _mm_storeu_si128((__m128i *)(dst + x * 6), shuf1); + _mm_storeu_si128((__m128i *)(dst + x * 6 + 16), shuf2); + _mm_storeu_si128((__m128i *)(dst + x * 6 + 32), shuf3); + } + } + for (; x < width; x++) { + if (nb_channels == 3 && bytes_per_channel == 1) { + dst[x * 3] = src[x * 3 + 2]; + dst[x * 3 + 1] = src[x * 3 + 1]; + dst[x * 3 + 2] = src[x * 3]; + } + if (nb_channels == 3 && bytes_per_channel == 2) { + dst[x * 6] = src[x * 6 + 4]; + dst[x * 6 + 1] = src[x * 6 + 3]; + dst[x * 6 + 2] = src[x * 6 + 2]; + dst[x * 6 + 3] = src[x * 6 + 3]; + dst[x * 6 + 4] = src[x * 6]; + dst[x * 6 + 5] = src[x * 6 + 1]; + } + if (nb_channels == 4 && bytes_per_channel == 1) { + dst[x * 4] = src[x * 4 + 2]; + dst[x * 4 + 1] = src[x * 4 + 1]; + dst[x * 4 + 2] = src[x * 4]; + dst[x * 4 + 3] = src[x * 4 + 3]; + } + if (nb_channels == 4 && bytes_per_channel == 2) { + dst[x * 8] = src[x * 8 + 4]; + dst[x * 8 + 1] = src[x * 8 + 3]; + dst[x * 8 + 2] = src[x * 8 + 2]; + dst[x * 8 + 3] = src[x * 8 + 3]; + dst[x * 8 + 4] = src[x * 8]; + dst[x * 8 + 5] = src[x * 8 + 1]; + dst[x * 8 + 6] = src[x * 8 + 6]; + dst[x * 8 + 7] = src[x * 8 + 7]; + } + } +} + } // namespace extern "C" size_t FPNGEEncode(size_t bytes_per_channel, size_t num_channels, @@ -1541,7 +1661,8 @@ extern "C" size_t FPNGEEncode(size_t bytes_per_channel, size_t num_channels, const unsigned char *topleft_buf = top_buf - bytes_per_channel * num_channels; - memcpy(current_row_buf, current_row_in, bytes_per_line); + CopyRow(current_row_buf, current_row_in, num_channels, bytes_per_channel, + (FPNGEColorChannelOrder)options->channel_order, width); if (y == y0 && y != 0) { continue; } @@ -1573,7 +1694,8 @@ extern "C" size_t FPNGEEncode(size_t bytes_per_channel, size_t num_channels, const unsigned char *topleft_buf = top_buf - bytes_per_channel * num_channels; - memcpy(current_row_buf, current_row_in, bytes_per_line); + CopyRow(current_row_buf, current_row_in, num_channels, bytes_per_channel, + (FPNGEColorChannelOrder)options->channel_order, width); EncodeOneRow(bytes_per_line, current_row_buf, top_buf, left_buf, topleft_buf, aligned_pdata_ptr, huffman_table, s1, s2, &writer, diff --git a/fpnge.h b/fpnge.h index fe94a4c..77db8a7 100644 --- a/fpnge.h +++ b/fpnge.h @@ -31,6 +31,11 @@ enum FPNGEOptionsPredictor { FPNGE_PREDICTOR_BEST }; +enum FPNGEColorChannelOrder { + FPNGE_ORDER_RGB, + FPNGE_ORDER_BGR, +}; + struct FPNGEAdditionalChunk { char name[4]; const char *data; @@ -41,6 +46,7 @@ struct FPNGEOptions { char predictor; // FPNGEOptionsPredictor char huffman_sample; // 0-127: how much of the image to sample char cicp_colorspace; // FPNGECicpColorspace + char channel_order; // FPNGEColorChannelOrder int num_additional_chunks; const struct FPNGEAdditionalChunk *additional_chunks; }; @@ -55,6 +61,7 @@ inline void FPNGEFillOptions(struct FPNGEOptions *options, int level, options->huffman_sample = 1; options->num_additional_chunks = 0; options->additional_chunks = NULL; + options->channel_order = FPNGE_ORDER_RGB; switch (level) { case 1: options->predictor = 2;