From 9c138a93417561e8516db8f856b12481edca55ad Mon Sep 17 00:00:00 2001 From: Aleksandr Karpinskii Date: Wed, 25 Sep 2024 10:56:22 +0400 Subject: [PATCH 01/14] Define math consts --- C/common.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/C/common.h b/C/common.h index ce581442..029b9bee 100644 --- a/C/common.h +++ b/C/common.h @@ -1,11 +1,8 @@ #ifndef __BLURHASH_COMMON_H__ #define __BLURHASH_COMMON_H__ -#include <math.h> - -#ifndef M_PI -#define M_PI 3.14159265358979323846 -#endif +#define _USE_MATH_DEFINES +#include <math.h> static inline int linearTosRGB(float value) { float v = fmaxf(0, fminf(1, value)); From 3152f570cc586be07145485bf6ed1ffbd13fefda Mon Sep 17 00:00:00 2001 From: Aleksandr Karpinskii Date: Wed, 25 Sep 2024 12:28:42 +0400 Subject: [PATCH 02/14] Fix number of arguments in blurhash_encoder --- C/encode_stb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/C/encode_stb.c b/C/encode_stb.c index cd3e461a..811ca000 100644 --- a/C/encode_stb.c +++ b/C/encode_stb.c @@ -15,8 +15,8 @@ int main(int argc, const char **argv) { int xComponents = atoi(argv[1]); int yComponents = atoi(argv[2]); - if(xComponents < 1 || xComponents > 8 || yComponents < 1 || yComponents > 8) { - fprintf(stderr, "Component counts must be between 1 and 8.\n"); + if(xComponents < 1 || xComponents > 9 || yComponents < 1 || yComponents > 9) { + fprintf(stderr, "Component counts must be between 1 and 9.\n"); return 1; } From b027b16e258d38468ceadb58779eb2a087714e78 Mon Sep 17 00:00:00 2001 From: Aleksandr Karpinskii Date: Wed, 25 Sep 2024 11:04:00 +0400 Subject: [PATCH 03/14] Show main warnings like unused variables --- C/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/C/Makefile b/C/Makefile index 33dddaa4..d8d0580e 100644 --- a/C/Makefile +++ b/C/Makefile @@ -1,10 +1,10 @@ PROGRAM=blurhash_encoder DECODER=blurhash_decoder $(PROGRAM): encode_stb.c encode.c encode.h stb_image.h common.h - $(CC) -o $@ encode_stb.c encode.c -lm 
-Ofast + $(CC) -o $@ encode_stb.c encode.c -lm -Ofast -Wall $(DECODER): decode_stb.c decode.c decode.h stb_writer.h common.h - $(CC) -o $(DECODER) decode_stb.c decode.c -lm -Ofast + $(CC) -o $(DECODER) decode_stb.c decode.c -lm -Ofast -Wall .PHONY: clean clean: From 7fe900e1ea8cd3827b7dd3924a7c1bbe0dce9a98 Mon Sep 17 00:00:00 2001 From: Aleksandr Karpinskii Date: Wed, 25 Sep 2024 11:37:09 +0400 Subject: [PATCH 04/14] Build object files separate for compilation speedup --- C/Makefile | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/C/Makefile b/C/Makefile index d8d0580e..cd02064e 100644 --- a/C/Makefile +++ b/C/Makefile @@ -1,12 +1,18 @@ PROGRAM=blurhash_encoder DECODER=blurhash_decoder -$(PROGRAM): encode_stb.c encode.c encode.h stb_image.h common.h - $(CC) -o $@ encode_stb.c encode.c -lm -Ofast -Wall -$(DECODER): decode_stb.c decode.c decode.h stb_writer.h common.h - $(CC) -o $(DECODER) decode_stb.c decode.c -lm -Ofast -Wall +encod%.o: encod%.c encode.h stb_image.h common.h + $(CC) -c $< -o $@ -Ofast -Wall +$(PROGRAM): encode_stb.o encode.o + $(CC) -o $@ encode_stb.o encode.o -lm + +decod%.o: decod%.c decode.h stb_writer.h common.h + $(CC) -c $< -o $@ -Ofast -Wall +$(DECODER): decode_stb.o decode.o + $(CC) -o $@ decode_stb.o decode.o -lm .PHONY: clean clean: rm -f $(PROGRAM) - rm -f $(DECODER) \ No newline at end of file + rm -f $(DECODER) + rm -f *.o From 6af77c915df1b5115be304584d2d7654145bda73 Mon Sep 17 00:00:00 2001 From: Aleksandr Karpinskii Date: Wed, 25 Sep 2024 10:57:06 +0400 Subject: [PATCH 05/14] Use sRGBToLinear_cache (4.5x speedup) --- C/encode.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/C/encode.c b/C/encode.c index c7a39da2..d7fc7436 100644 --- a/C/encode.c +++ b/C/encode.c @@ -9,6 +9,18 @@ static char *encode_int(int value, int length, char *destination); static int encodeDC(float r, float g, float b); static int encodeAC(float r, float g, float b, float maximumValue); 
+float *sRGBToLinear_cache = NULL; + +static void init_sRGBToLinear_cache() { + if (sRGBToLinear_cache != NULL) { + return; + } + sRGBToLinear_cache = (float *)malloc(sizeof(float) * 256); + for (int x = 0; x < 256; x++) { + sRGBToLinear_cache[x] = sRGBToLinear(x); + } +} + const char *blurHashForPixels(int xComponents, int yComponents, int width, int height, uint8_t *rgb, size_t bytesPerRow) { static char buffer[2 + 4 + (9 * 9 - 1) * 2 + 1]; @@ -18,6 +30,8 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h float factors[yComponents][xComponents][3]; memset(factors, 0, sizeof(factors)); + init_sRGBToLinear_cache(); + for(int y = 0; y < yComponents; y++) { for(int x = 0; x < xComponents; x++) { float *factor = multiplyBasisFunction(x, y, width, height, rgb, bytesPerRow); @@ -68,9 +82,9 @@ static float *multiplyBasisFunction(int xComponent, int yComponent, int width, i for(int y = 0; y < height; y++) { for(int x = 0; x < width; x++) { float basis = cosf(M_PI * xComponent * x / width) * cosf(M_PI * yComponent * y / height); - r += basis * sRGBToLinear(rgb[3 * x + 0 + y * bytesPerRow]); - g += basis * sRGBToLinear(rgb[3 * x + 1 + y * bytesPerRow]); - b += basis * sRGBToLinear(rgb[3 * x + 2 + y * bytesPerRow]); + r += basis * sRGBToLinear_cache[rgb[3 * x + 0 + y * bytesPerRow]]; + g += basis * sRGBToLinear_cache[rgb[3 * x + 1 + y * bytesPerRow]]; + b += basis * sRGBToLinear_cache[rgb[3 * x + 2 + y * bytesPerRow]]; } } From d936afb7bd77c62eb812577b726a61cea22130d3 Mon Sep 17 00:00:00 2001 From: Aleksandr Karpinskii Date: Wed, 25 Sep 2024 11:05:45 +0400 Subject: [PATCH 06/14] cosX cache (5.6x speedup) --- C/encode.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/C/encode.c b/C/encode.c index d7fc7436..f3ea9dec 100644 --- a/C/encode.c +++ b/C/encode.c @@ -3,7 +3,9 @@ #include -static float *multiplyBasisFunction(int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t 
bytesPerRow); +static float *multiplyBasisFunction( + int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t bytesPerRow, + float *cosX); static char *encode_int(int value, int length, char *destination); static int encodeDC(float r, float g, float b); @@ -32,14 +34,17 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h init_sRGBToLinear_cache(); + float *cosX = (float *)malloc(sizeof(float) * width); + if (! cosX) return NULL; for(int y = 0; y < yComponents; y++) { for(int x = 0; x < xComponents; x++) { - float *factor = multiplyBasisFunction(x, y, width, height, rgb, bytesPerRow); + float *factor = multiplyBasisFunction(x, y, width, height, rgb, bytesPerRow, cosX); factors[y][x][0] = factor[0]; factors[y][x][1] = factor[1]; factors[y][x][2] = factor[2]; } } + free(cosX); float *dc = factors[0][0]; float *ac = dc + 3; @@ -75,16 +80,25 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h return buffer; } -static float *multiplyBasisFunction(int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t bytesPerRow) { +static float *multiplyBasisFunction( + int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t bytesPerRow, + float *cosX +) { float r = 0, g = 0, b = 0; float normalisation = (xComponent == 0 && yComponent == 0) ? 
1 : 2; + for(int x = 0; x < width; x++) { + cosX[x] = cosf(M_PI * xComponent * x / width); + } + for(int y = 0; y < height; y++) { + uint8_t *src = rgb + y * bytesPerRow; + float cosY = cosf(M_PI * yComponent * y / height); for(int x = 0; x < width; x++) { - float basis = cosf(M_PI * xComponent * x / width) * cosf(M_PI * yComponent * y / height); - r += basis * sRGBToLinear_cache[rgb[3 * x + 0 + y * bytesPerRow]]; - g += basis * sRGBToLinear_cache[rgb[3 * x + 1 + y * bytesPerRow]]; - b += basis * sRGBToLinear_cache[rgb[3 * x + 2 + y * bytesPerRow]]; + float basis = cosX[x] * cosY; + r += basis * sRGBToLinear_cache[src[3 * x + 0]]; + g += basis * sRGBToLinear_cache[src[3 * x + 1]]; + b += basis * sRGBToLinear_cache[src[3 * x + 2]]; } } From 2e19ea79948933c89d8e022c808e5279032e1c08 Mon Sep 17 00:00:00 2001 From: Aleksandr Karpinskii Date: Wed, 25 Sep 2024 12:03:30 +0400 Subject: [PATCH 07/14] Prepare cosX && cosY once for all passes --- C/encode.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/C/encode.c b/C/encode.c index f3ea9dec..466dea9a 100644 --- a/C/encode.c +++ b/C/encode.c @@ -5,7 +5,7 @@ static float *multiplyBasisFunction( int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t bytesPerRow, - float *cosX); + float *cosX, float *cosY); static char *encode_int(int value, int length, char *destination); static int encodeDC(float r, float g, float b); @@ -34,17 +34,32 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h init_sRGBToLinear_cache(); - float *cosX = (float *)malloc(sizeof(float) * width); + float *cosX = (float *)malloc(sizeof(float) * width * xComponents); if (! cosX) return NULL; + float *cosY = (float *)malloc(sizeof(float) * height); + if (! 
cosY) { + free(cosX); + return NULL; + } + for(int x = 0; x < xComponents; x++) { + for(int i = 0; i < width; i++) { + cosX[x * width + i] = cosf(M_PI * x * i / width); + } + } for(int y = 0; y < yComponents; y++) { + for(int i = 0; i < height; i++) { + cosY[i] = cosf(M_PI * y * i / height); + } for(int x = 0; x < xComponents; x++) { - float *factor = multiplyBasisFunction(x, y, width, height, rgb, bytesPerRow, cosX); + float *factor = multiplyBasisFunction(x, y, width, height, rgb, bytesPerRow, + cosX + x * width, cosY); factors[y][x][0] = factor[0]; factors[y][x][1] = factor[1]; factors[y][x][2] = factor[2]; } } free(cosX); + free(cosY); float *dc = factors[0][0]; float *ac = dc + 3; @@ -82,20 +97,15 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h static float *multiplyBasisFunction( int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t bytesPerRow, - float *cosX + float *cosX, float *cosY ) { float r = 0, g = 0, b = 0; float normalisation = (xComponent == 0 && yComponent == 0) ? 
1 : 2; - for(int x = 0; x < width; x++) { - cosX[x] = cosf(M_PI * xComponent * x / width); - } - for(int y = 0; y < height; y++) { uint8_t *src = rgb + y * bytesPerRow; - float cosY = cosf(M_PI * yComponent * y / height); for(int x = 0; x < width; x++) { - float basis = cosX[x] * cosY; + float basis = cosY[y] * cosX[x]; r += basis * sRGBToLinear_cache[src[3 * x + 0]]; g += basis * sRGBToLinear_cache[src[3 * x + 1]]; b += basis * sRGBToLinear_cache[src[3 * x + 2]]; From d3d26c1b9d2df59004789b3b49ca2e65922cabb5 Mon Sep 17 00:00:00 2001 From: Aleksandr Karpinskii Date: Wed, 25 Sep 2024 12:26:51 +0400 Subject: [PATCH 08/14] Calculate factors in one call (up to 1.6x speedup) --- C/encode.c | 79 +++++++++++++++++++++++++++++------------------------- 1 file changed, 42 insertions(+), 37 deletions(-) diff --git a/C/encode.c b/C/encode.c index 466dea9a..39309bb7 100644 --- a/C/encode.c +++ b/C/encode.c @@ -3,8 +3,8 @@ #include -static float *multiplyBasisFunction( - int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t bytesPerRow, +static void multiplyBasisFunction( + float factors[][3], int factorsCount, int width, int height, uint8_t *rgb, size_t bytesPerRow, float *cosX, float *cosY); static char *encode_int(int value, int length, char *destination); @@ -29,41 +29,42 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h if(xComponents < 1 || xComponents > 9) return NULL; if(yComponents < 1 || yComponents > 9) return NULL; - float factors[yComponents][xComponents][3]; + float factors[yComponents * xComponents][3]; + int factorsCount = xComponents * yComponents; memset(factors, 0, sizeof(factors)); init_sRGBToLinear_cache(); - float *cosX = (float *)malloc(sizeof(float) * width * xComponents); + float *cosX = (float *)malloc(sizeof(float) * width * factorsCount); if (! cosX) return NULL; - float *cosY = (float *)malloc(sizeof(float) * height); + float *cosY = (float *)malloc(sizeof(float) * height * factorsCount); if (! 
cosY) { free(cosX); return NULL; } - for(int x = 0; x < xComponents; x++) { - for(int i = 0; i < width; i++) { - cosX[x * width + i] = cosf(M_PI * x * i / width); + for(int i = 0; i < width; i++) { + for(int x = 0; x < xComponents; x++) { + float weight = cosf(M_PI * x * i / width); + for(int y = 0; y < yComponents; y++) { + cosX[i * factorsCount + y * xComponents + x] = weight; + } } } - for(int y = 0; y < yComponents; y++) { - for(int i = 0; i < height; i++) { - cosY[i] = cosf(M_PI * y * i / height); - } - for(int x = 0; x < xComponents; x++) { - float *factor = multiplyBasisFunction(x, y, width, height, rgb, bytesPerRow, - cosX + x * width, cosY); - factors[y][x][0] = factor[0]; - factors[y][x][1] = factor[1]; - factors[y][x][2] = factor[2]; + for(int i = 0; i < height; i++) { + for(int y = 0; y < yComponents; y++) { + float weight = cosf(M_PI * y * i / height); + for(int x = 0; x < xComponents; x++) { + cosY[i * factorsCount + y * xComponents + x] = weight; + } } } + multiplyBasisFunction(factors, factorsCount, width, height, rgb, bytesPerRow, cosX, cosY); free(cosX); free(cosY); - float *dc = factors[0][0]; + float *dc = factors[0]; float *ac = dc + 3; - int acCount = xComponents * yComponents - 1; + int acCount = factorsCount - 1; char *ptr = buffer; int sizeFlag = (xComponents - 1) + (yComponents - 1) * 9; @@ -95,31 +96,35 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h return buffer; } -static float *multiplyBasisFunction( - int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t bytesPerRow, +static void multiplyBasisFunction( + float factors[][3], int factorsCount, int width, int height, uint8_t *rgb, size_t bytesPerRow, float *cosX, float *cosY ) { - float r = 0, g = 0, b = 0; - float normalisation = (xComponent == 0 && yComponent == 0) ? 
1 : 2; - for(int y = 0; y < height; y++) { uint8_t *src = rgb + y * bytesPerRow; + float *cosYLocal = cosY + y * factorsCount; for(int x = 0; x < width; x++) { - float basis = cosY[y] * cosX[x]; - r += basis * sRGBToLinear_cache[src[3 * x + 0]]; - g += basis * sRGBToLinear_cache[src[3 * x + 1]]; - b += basis * sRGBToLinear_cache[src[3 * x + 2]]; + float pixel[3]; + float *cosXLocal = cosX + x * factorsCount; + pixel[0] = sRGBToLinear_cache[src[3 * x + 0]]; + pixel[1] = sRGBToLinear_cache[src[3 * x + 1]]; + pixel[2] = sRGBToLinear_cache[src[3 * x + 2]]; + for (int i = 0; i < factorsCount; i++) { + float basis = cosYLocal[i] * cosXLocal[i]; + factors[i][0] += basis * pixel[0]; + factors[i][1] += basis * pixel[1]; + factors[i][2] += basis * pixel[2]; + } } } - float scale = normalisation / (width * height); - - static float result[3]; - result[0] = r * scale; - result[1] = g * scale; - result[2] = b * scale; - - return result; + for (int i = 0; i < factorsCount; i++) { + float normalisation = (i == 0) ? 
1 : 2; + float scale = normalisation / (width * height); + factors[i][0] *= scale; + factors[i][1] *= scale; + factors[i][2] *= scale; + } } From 52d4a62dba8bd5ad2affb9769507c4f7ca93c213 Mon Sep 17 00:00:00 2001 From: Aleksandr Karpinskii Date: Thu, 26 Sep 2024 21:01:07 +0400 Subject: [PATCH 09/14] unroll multiplyBasisFunction loop (2.5x speedup) --- C/encode.c | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/C/encode.c b/C/encode.c index 39309bb7..0e6dfdc3 100644 --- a/C/encode.c +++ b/C/encode.c @@ -4,7 +4,7 @@ #include static void multiplyBasisFunction( - float factors[][3], int factorsCount, int width, int height, uint8_t *rgb, size_t bytesPerRow, + float factors[][4], int factorsCount, int width, int height, uint8_t *rgb, size_t bytesPerRow, float *cosX, float *cosY); static char *encode_int(int value, int length, char *destination); @@ -29,7 +29,7 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h if(xComponents < 1 || xComponents > 9) return NULL; if(yComponents < 1 || yComponents > 9) return NULL; - float factors[yComponents * xComponents][3]; + float factors[yComponents * xComponents][4]; int factorsCount = xComponents * yComponents; memset(factors, 0, sizeof(factors)); @@ -63,7 +63,7 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h free(cosY); float *dc = factors[0]; - float *ac = dc + 3; + float *ac = dc + 4; int acCount = factorsCount - 1; char *ptr = buffer; @@ -73,7 +73,7 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h float maximumValue; if(acCount > 0) { float actualMaximumValue = 0; - for(int i = 0; i < acCount * 3; i++) { + for(int i = 0; i < acCount * 4; i++) { actualMaximumValue = fmaxf(fabsf(ac[i]), actualMaximumValue); } @@ -88,7 +88,7 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h ptr = encode_int(encodeDC(dc[0], dc[1], dc[2]), 4, ptr); for(int i = 0; i 
< acCount; i++) { - ptr = encode_int(encodeAC(ac[i * 3 + 0], ac[i * 3 + 1], ac[i * 3 + 2], maximumValue), 2, ptr); + ptr = encode_int(encodeAC(ac[i * 4 + 0], ac[i * 4 + 1], ac[i * 4 + 2], maximumValue), 2, ptr); } *ptr = 0; @@ -97,14 +97,31 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h } static void multiplyBasisFunction( - float factors[][3], int factorsCount, int width, int height, uint8_t *rgb, size_t bytesPerRow, + float factors[][4], int factorsCount, int width, int height, uint8_t *rgb, size_t bytesPerRow, float *cosX, float *cosY ) { for(int y = 0; y < height; y++) { uint8_t *src = rgb + y * bytesPerRow; float *cosYLocal = cosY + y * factorsCount; - for(int x = 0; x < width; x++) { - float pixel[3]; + int x = 0; + for(; x < width - 3; x += 4) { + float *cosXLocal = cosX + x * factorsCount; + float pixel0[4] = {sRGBToLinear_cache[src[3 * (x+0) + 0]], sRGBToLinear_cache[src[3 * (x+0) + 1]], sRGBToLinear_cache[src[3 * (x+0) + 2]]}; + float pixel1[4] = {sRGBToLinear_cache[src[3 * (x+1) + 0]], sRGBToLinear_cache[src[3 * (x+1) + 1]], sRGBToLinear_cache[src[3 * (x+1) + 2]]}; + float pixel2[4] = {sRGBToLinear_cache[src[3 * (x+2) + 0]], sRGBToLinear_cache[src[3 * (x+2) + 1]], sRGBToLinear_cache[src[3 * (x+2) + 2]]}; + float pixel3[4] = {sRGBToLinear_cache[src[3 * (x+3) + 0]], sRGBToLinear_cache[src[3 * (x+3) + 1]], sRGBToLinear_cache[src[3 * (x+3) + 2]]}; + for (int i = 0; i < factorsCount; i++) { + float basis0 = cosYLocal[i] * cosXLocal[i + 0 * factorsCount]; + float basis1 = cosYLocal[i] * cosXLocal[i + 1 * factorsCount]; + float basis2 = cosYLocal[i] * cosXLocal[i + 2 * factorsCount]; + float basis3 = cosYLocal[i] * cosXLocal[i + 3 * factorsCount]; + factors[i][0] += basis0 * pixel0[0] + basis1 * pixel1[0] + basis2 * pixel2[0] + basis3 * pixel3[0]; + factors[i][1] += basis0 * pixel0[1] + basis1 * pixel1[1] + basis2 * pixel2[1] + basis3 * pixel3[1]; + factors[i][2] += basis0 * pixel0[2] + basis1 * pixel1[2] + basis2 * 
pixel2[2] + basis3 * pixel3[2]; + } + } + for(; x < width; x++) { + float pixel[4]; float *cosXLocal = cosX + x * factorsCount; pixel[0] = sRGBToLinear_cache[src[3 * x + 0]]; pixel[1] = sRGBToLinear_cache[src[3 * x + 1]]; From 89a2524ecbe86bcb5fc9942be65aa1d236f89d31 Mon Sep 17 00:00:00 2001 From: Aleksandr Karpinskii Date: Fri, 18 Oct 2024 12:43:32 +0400 Subject: [PATCH 10/14] Assign sRGBToLinear_cache after population to avoid races --- C/encode.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/C/encode.c b/C/encode.c index 0e6dfdc3..b9ca6b7c 100644 --- a/C/encode.c +++ b/C/encode.c @@ -14,13 +14,16 @@ static int encodeAC(float r, float g, float b, float maximumValue); float *sRGBToLinear_cache = NULL; static void init_sRGBToLinear_cache() { + float *cache; if (sRGBToLinear_cache != NULL) { return; } - sRGBToLinear_cache = (float *)malloc(sizeof(float) * 256); + cache = (float *)malloc(sizeof(float) * 256); for (int x = 0; x < 256; x++) { - sRGBToLinear_cache[x] = sRGBToLinear(x); + cache[x] = sRGBToLinear(x); } + // Assign cache after population to avoid races + sRGBToLinear_cache = cache; } const char *blurHashForPixels(int xComponents, int yComponents, int width, int height, uint8_t *rgb, size_t bytesPerRow) { @@ -33,8 +36,6 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h int factorsCount = xComponents * yComponents; memset(factors, 0, sizeof(factors)); - init_sRGBToLinear_cache(); - float *cosX = (float *)malloc(sizeof(float) * width * factorsCount); if (! 
cosX) return NULL; float *cosY = (float *)malloc(sizeof(float) * height * factorsCount); @@ -100,6 +101,8 @@ static void multiplyBasisFunction( float factors[][4], int factorsCount, int width, int height, uint8_t *rgb, size_t bytesPerRow, float *cosX, float *cosY ) { + init_sRGBToLinear_cache(); + for(int y = 0; y < height; y++) { uint8_t *src = rgb + y * bytesPerRow; float *cosYLocal = cosY + y * factorsCount; From 3ab38fec4a06db2d175a95194ee2d0f5506530c4 Mon Sep 17 00:00:00 2001 From: Aleksandr Karpinskii Date: Wed, 23 Oct 2024 19:00:12 +0400 Subject: [PATCH 11/14] decoder: cosf is about 17% faster --- C/decode.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/C/decode.c b/C/decode.c index a8cca05d..644a5b40 100644 --- a/C/decode.c +++ b/C/decode.c @@ -3,10 +3,8 @@ static char chars[83] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz#$%*+,-.:;=?@[]^_{|}~"; -static inline uint8_t clampToUByte(int * src) { - if( *src >= 0 && *src <= 255 ) - return *src; - return (*src < 0) ? 0 : 255; +static inline uint8_t clampToUByte(int src) { + return (src < 0) ? 0 : (src > 255) ? 
255 : src; } static inline uint8_t * createByteArray(int size) { @@ -107,7 +105,7 @@ int decodeToArray(const char * blurhash, int width, int height, int punch, int n for(j = 0; j < numY; j ++) { for(i = 0; i < numX; i ++) { - float basics = cos((M_PI * x * i) / width) * cos((M_PI * y * j) / height); + float basics = cosf((M_PI * x * i) / width) * cosf((M_PI * y * j) / height); int idx = i + j * numX; r += colors[idx][0] * basics; g += colors[idx][1] * basics; @@ -119,9 +117,9 @@ int decodeToArray(const char * blurhash, int width, int height, int punch, int n intG = linearTosRGB(g); intB = linearTosRGB(b); - pixelArray[nChannels * x + 0 + y * bytesPerRow] = clampToUByte(&intR); - pixelArray[nChannels * x + 1 + y * bytesPerRow] = clampToUByte(&intG); - pixelArray[nChannels * x + 2 + y * bytesPerRow] = clampToUByte(&intB); + pixelArray[nChannels * x + 0 + y * bytesPerRow] = clampToUByte(intR); + pixelArray[nChannels * x + 1 + y * bytesPerRow] = clampToUByte(intG); + pixelArray[nChannels * x + 2 + y * bytesPerRow] = clampToUByte(intB); if (nChannels == 4) pixelArray[nChannels * x + 3 + y * bytesPerRow] = 255; // If nChannels=4, treat each pixel as RGBA instead of RGB From 6f02d6e134a4a9bc7bbd7207a79f3cab7a485506 Mon Sep 17 00:00:00 2001 From: Aleksandr Karpinskii Date: Wed, 23 Oct 2024 19:06:38 +0400 Subject: [PATCH 12/14] decoder: cache cos (3.2x faster) --- C/decode.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/C/decode.c b/C/decode.c index 644a5b40..7dcfb3aa 100644 --- a/C/decode.c +++ b/C/decode.c @@ -98,6 +98,19 @@ int decodeToArray(const char * blurhash, int width, int height, int punch, int n int x = 0, y = 0, i = 0, j = 0; int intR = 0, intG = 0, intB = 0; + float *cosx = malloc(width * numX * sizeof(float)); + float *cosy = malloc(height * numX * sizeof(float)); + for(x = 0; x < width; x ++) { + for(i = 0; i < numX; i ++) { + cosx[x*numX + i] = cosf((M_PI * x * i) / width); + } + } + for(y = 0; y < height; y ++) { + 
for(j = 0; j < numY; j ++) { + cosy[y*numY + j] = cosf((M_PI * y * j) / height); + } + } + for(y = 0; y < height; y ++) { for(x = 0; x < width; x ++) { @@ -105,7 +118,7 @@ int decodeToArray(const char * blurhash, int width, int height, int punch, int n for(j = 0; j < numY; j ++) { for(i = 0; i < numX; i ++) { - float basics = cosf((M_PI * x * i) / width) * cosf((M_PI * y * j) / height); + float basics = cosx[x*numX + i] * cosy[y*numY + j]; int idx = i + j * numX; r += colors[idx][0] * basics; g += colors[idx][1] * basics; @@ -127,6 +140,9 @@ int decodeToArray(const char * blurhash, int width, int height, int punch, int n } } + free(cosx); + free(cosy); + return 0; } From 0f2d8c90627597869df1217186f6d988afa72476 Mon Sep 17 00:00:00 2001 From: Aleksandr Karpinskii Date: Wed, 23 Oct 2024 19:19:15 +0400 Subject: [PATCH 13/14] decode: unroll inner loop (20% faster) --- C/decode.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/C/decode.c b/C/decode.c index 7dcfb3aa..f7eda246 100644 --- a/C/decode.c +++ b/C/decode.c @@ -98,16 +98,22 @@ int decodeToArray(const char * blurhash, int width, int height, int punch, int n int x = 0, y = 0, i = 0, j = 0; int intR = 0, intG = 0, intB = 0; - float *cosx = malloc(width * numX * sizeof(float)); - float *cosy = malloc(height * numX * sizeof(float)); + float *cosX = malloc(width * colors_size * sizeof(float)); + float *cosY = malloc(height * colors_size * sizeof(float)); for(x = 0; x < width; x ++) { for(i = 0; i < numX; i ++) { - cosx[x*numX + i] = cosf((M_PI * x * i) / width); + float weight = cosf(M_PI * x * i / width); + for(j = 0; j < numY; j ++) { + cosX[x * colors_size + j * numX + i] = weight; + } } } for(y = 0; y < height; y ++) { for(j = 0; j < numY; j ++) { - cosy[y*numY + j] = cosf((M_PI * y * j) / height); + float weight = cosf((M_PI * y * j) / height); + for(i = 0; i < numX; i ++) { + cosY[y * colors_size + j * numX + i] = weight; + } } } @@ -116,14 +122,11 @@ int 
decodeToArray(const char * blurhash, int width, int height, int punch, int n float r = 0, g = 0, b = 0; - for(j = 0; j < numY; j ++) { - for(i = 0; i < numX; i ++) { - float basics = cosx[x*numX + i] * cosy[y*numY + j]; - int idx = i + j * numX; - r += colors[idx][0] * basics; - g += colors[idx][1] * basics; - b += colors[idx][2] * basics; - } + for (int idx = 0; idx < colors_size; idx ++) { + float basics = cosX[x * colors_size + idx] * cosY[y * colors_size + idx]; + r += colors[idx][0] * basics; + g += colors[idx][1] * basics; + b += colors[idx][2] * basics; } intR = linearTosRGB(r); @@ -140,8 +143,8 @@ int decodeToArray(const char * blurhash, int width, int height, int punch, int n } } - free(cosx); - free(cosy); + free(cosX); + free(cosY); return 0; } From d2a09cd8671ea5d2b239a078f83ea1e554975a02 Mon Sep 17 00:00:00 2001 From: Aleksandr Karpinskii Date: Wed, 23 Oct 2024 19:40:45 +0400 Subject: [PATCH 14/14] decode: Cache linearTosRGB (2.75x speedup) --- C/decode.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/C/decode.c b/C/decode.c index f7eda246..640a3112 100644 --- a/C/decode.c +++ b/C/decode.c @@ -3,8 +3,26 @@ static char chars[83] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz#$%*+,-.:;=?@[]^_{|}~"; -static inline uint8_t clampToUByte(int src) { - return (src < 0) ? 0 : (src > 255) ? 255 : src; +#define CACHE_ACCURACY 8191 + +static inline int convertToCacheIdx(float src) { + int res = src * CACHE_ACCURACY + 0.5; + return (res < 0) ? 0 : (res > CACHE_ACCURACY) ? 
CACHE_ACCURACY : res; +} + +uint8_t *linearTosRGB_cache = NULL; + +static void init_linearTosRGB_cache() { + uint8_t *cache; + if (linearTosRGB_cache != NULL) { + return; + } + cache = (uint8_t *)malloc(sizeof(uint8_t) * (CACHE_ACCURACY + 1)); + for (int x = 0; x <= CACHE_ACCURACY; x++) { + cache[x] = linearTosRGB((float)x / CACHE_ACCURACY); + } + // Assign cache after population to avoid races + linearTosRGB_cache = cache; } static inline uint8_t * createByteArray(int size) { @@ -96,7 +114,6 @@ int decodeToArray(const char * blurhash, int width, int height, int punch, int n int bytesPerRow = width * nChannels; int x = 0, y = 0, i = 0, j = 0; - int intR = 0, intG = 0, intB = 0; float *cosX = malloc(width * colors_size * sizeof(float)); float *cosY = malloc(height * colors_size * sizeof(float)); @@ -117,6 +134,8 @@ int decodeToArray(const char * blurhash, int width, int height, int punch, int n } } + init_linearTosRGB_cache(); + for(y = 0; y < height; y ++) { for(x = 0; x < width; x ++) { @@ -129,13 +148,9 @@ int decodeToArray(const char * blurhash, int width, int height, int punch, int n b += colors[idx][2] * basics; } - intR = linearTosRGB(r); - intG = linearTosRGB(g); - intB = linearTosRGB(b); - - pixelArray[nChannels * x + 0 + y * bytesPerRow] = clampToUByte(intR); - pixelArray[nChannels * x + 1 + y * bytesPerRow] = clampToUByte(intG); - pixelArray[nChannels * x + 2 + y * bytesPerRow] = clampToUByte(intB); + pixelArray[nChannels * x + 0 + y * bytesPerRow] = linearTosRGB_cache[convertToCacheIdx(r)]; + pixelArray[nChannels * x + 1 + y * bytesPerRow] = linearTosRGB_cache[convertToCacheIdx(g)]; + pixelArray[nChannels * x + 2 + y * bytesPerRow] = linearTosRGB_cache[convertToCacheIdx(b)]; if (nChannels == 4) pixelArray[nChannels * x + 3 + y * bytesPerRow] = 255; // If nChannels=4, treat each pixel as RGBA instead of RGB