From 9c138a93417561e8516db8f856b12481edca55ad Mon Sep 17 00:00:00 2001
From: Aleksandr Karpinskii <homm86@gmail.com>
Date: Wed, 25 Sep 2024 10:56:22 +0400
Subject: [PATCH 01/14] Define math consts

---
 C/common.h | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/C/common.h b/C/common.h
index ce581442..029b9bee 100644
--- a/C/common.h
+++ b/C/common.h
@@ -1,11 +1,8 @@
 #ifndef __BLURHASH_COMMON_H__
 #define __BLURHASH_COMMON_H__
 
-#include<math.h>
-
-#ifndef M_PI
-#define M_PI 3.14159265358979323846
-#endif
+#define _USE_MATH_DEFINES
+#include <math.h>
 
 static inline int linearTosRGB(float value) {
 	float v = fmaxf(0, fminf(1, value));

From 3152f570cc586be07145485bf6ed1ffbd13fefda Mon Sep 17 00:00:00 2001
From: Aleksandr Karpinskii <homm86@gmail.com>
Date: Wed, 25 Sep 2024 12:28:42 +0400
Subject: [PATCH 02/14] Fix allowed component count range (1..9) in blurhash_encoder

---
 C/encode_stb.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/C/encode_stb.c b/C/encode_stb.c
index cd3e461a..811ca000 100644
--- a/C/encode_stb.c
+++ b/C/encode_stb.c
@@ -15,8 +15,8 @@ int main(int argc, const char **argv) {
 
 	int xComponents = atoi(argv[1]);
 	int yComponents = atoi(argv[2]);
-	if(xComponents < 1 || xComponents > 8 || yComponents < 1 || yComponents > 8) {
-		fprintf(stderr, "Component counts must be between 1 and 8.\n");
+	if(xComponents < 1 || xComponents > 9 || yComponents < 1 || yComponents > 9) {
+		fprintf(stderr, "Component counts must be between 1 and 9.\n");
 		return 1;
 	}
 

From b027b16e258d38468ceadb58779eb2a087714e78 Mon Sep 17 00:00:00 2001
From: Aleksandr Karpinskii <homm86@gmail.com>
Date: Wed, 25 Sep 2024 11:04:00 +0400
Subject: [PATCH 03/14] Show main warnings like unused variables

---
 C/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/C/Makefile b/C/Makefile
index 33dddaa4..d8d0580e 100644
--- a/C/Makefile
+++ b/C/Makefile
@@ -1,10 +1,10 @@
 PROGRAM=blurhash_encoder
 DECODER=blurhash_decoder
 $(PROGRAM): encode_stb.c encode.c encode.h stb_image.h common.h
-	$(CC) -o $@ encode_stb.c encode.c -lm -Ofast
+	$(CC) -o $@ encode_stb.c encode.c -lm -Ofast -Wall
 
 $(DECODER): decode_stb.c decode.c decode.h stb_writer.h common.h
-	$(CC) -o $(DECODER) decode_stb.c decode.c -lm -Ofast
+	$(CC) -o $(DECODER) decode_stb.c decode.c -lm -Ofast -Wall
 
 .PHONY: clean
 clean:

From 7fe900e1ea8cd3827b7dd3924a7c1bbe0dce9a98 Mon Sep 17 00:00:00 2001
From: Aleksandr Karpinskii <homm86@gmail.com>
Date: Wed, 25 Sep 2024 11:37:09 +0400
Subject: [PATCH 04/14] Build object files separate for compilation speedup

---
 C/Makefile | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/C/Makefile b/C/Makefile
index d8d0580e..cd02064e 100644
--- a/C/Makefile
+++ b/C/Makefile
@@ -1,12 +1,18 @@
 PROGRAM=blurhash_encoder
 DECODER=blurhash_decoder
-$(PROGRAM): encode_stb.c encode.c encode.h stb_image.h common.h
-	$(CC) -o $@ encode_stb.c encode.c -lm -Ofast -Wall
 
-$(DECODER): decode_stb.c decode.c decode.h stb_writer.h common.h
-	$(CC) -o $(DECODER) decode_stb.c decode.c -lm -Ofast -Wall
+encod%.o: encod%.c encode.h stb_image.h common.h
+	$(CC) -c $< -o $@ -Ofast -Wall
+$(PROGRAM): encode_stb.o encode.o
+	$(CC) -o $@ encode_stb.o encode.o -lm
+
+decod%.o: decod%.c decode.h stb_writer.h common.h
+	$(CC) -c $< -o $@ -Ofast -Wall
+$(DECODER): decode_stb.o decode.o
+	$(CC) -o $@ decode_stb.o decode.o -lm
 
 .PHONY: clean
 clean:
 	rm -f $(PROGRAM)
-	rm -f $(DECODER)
\ No newline at end of file
+	rm -f $(DECODER)
+	rm -f *.o

From 6af77c915df1b5115be304584d2d7654145bda73 Mon Sep 17 00:00:00 2001
From: Aleksandr Karpinskii <homm86@gmail.com>
Date: Wed, 25 Sep 2024 10:57:06 +0400
Subject: [PATCH 05/14] Use sRGBToLinear_cache (4.5x speedup)

---
 C/encode.c | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/C/encode.c b/C/encode.c
index c7a39da2..d7fc7436 100644
--- a/C/encode.c
+++ b/C/encode.c
@@ -9,6 +9,18 @@ static char *encode_int(int value, int length, char *destination);
 static int encodeDC(float r, float g, float b);
 static int encodeAC(float r, float g, float b, float maximumValue);
 
+float *sRGBToLinear_cache = NULL;
+
+static void init_sRGBToLinear_cache() {
+	if (sRGBToLinear_cache != NULL) {
+		return;
+	}
+	sRGBToLinear_cache = (float *)malloc(sizeof(float) * 256);
+	for (int x = 0; x < 256; x++) {
+		sRGBToLinear_cache[x] = sRGBToLinear(x);
+	}
+}
+
 const char *blurHashForPixels(int xComponents, int yComponents, int width, int height, uint8_t *rgb, size_t bytesPerRow) {
 	static char buffer[2 + 4 + (9 * 9 - 1) * 2 + 1];
 
@@ -18,6 +30,8 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
 	float factors[yComponents][xComponents][3];
 	memset(factors, 0, sizeof(factors));
 
+	init_sRGBToLinear_cache();
+
 	for(int y = 0; y < yComponents; y++) {
 		for(int x = 0; x < xComponents; x++) {
 			float *factor = multiplyBasisFunction(x, y, width, height, rgb, bytesPerRow);
@@ -68,9 +82,9 @@ static float *multiplyBasisFunction(int xComponent, int yComponent, int width, i
 	for(int y = 0; y < height; y++) {
 		for(int x = 0; x < width; x++) {
 			float basis = cosf(M_PI * xComponent * x / width) * cosf(M_PI * yComponent * y / height);
-			r += basis * sRGBToLinear(rgb[3 * x + 0 + y * bytesPerRow]);
-			g += basis * sRGBToLinear(rgb[3 * x + 1 + y * bytesPerRow]);
-			b += basis * sRGBToLinear(rgb[3 * x + 2 + y * bytesPerRow]);
+			r += basis * sRGBToLinear_cache[rgb[3 * x + 0 + y * bytesPerRow]];
+			g += basis * sRGBToLinear_cache[rgb[3 * x + 1 + y * bytesPerRow]];
+			b += basis * sRGBToLinear_cache[rgb[3 * x + 2 + y * bytesPerRow]];
 		}
 	}
 

From d936afb7bd77c62eb812577b726a61cea22130d3 Mon Sep 17 00:00:00 2001
From: Aleksandr Karpinskii <homm86@gmail.com>
Date: Wed, 25 Sep 2024 11:05:45 +0400
Subject: [PATCH 06/14] cosX cache (5.6x speedup)

---
 C/encode.c | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/C/encode.c b/C/encode.c
index d7fc7436..f3ea9dec 100644
--- a/C/encode.c
+++ b/C/encode.c
@@ -3,7 +3,9 @@
 
 #include <string.h>
 
-static float *multiplyBasisFunction(int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t bytesPerRow);
+static float *multiplyBasisFunction(
+	int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t bytesPerRow,
+	float *cosX);
 static char *encode_int(int value, int length, char *destination);
 
 static int encodeDC(float r, float g, float b);
@@ -32,14 +34,17 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
 
 	init_sRGBToLinear_cache();
 
+	float *cosX = (float *)malloc(sizeof(float) * width);
+	if (! cosX) return NULL;
 	for(int y = 0; y < yComponents; y++) {
 		for(int x = 0; x < xComponents; x++) {
-			float *factor = multiplyBasisFunction(x, y, width, height, rgb, bytesPerRow);
+			float *factor = multiplyBasisFunction(x, y, width, height, rgb, bytesPerRow, cosX);
 			factors[y][x][0] = factor[0];
 			factors[y][x][1] = factor[1];
 			factors[y][x][2] = factor[2];
 		}
 	}
+	free(cosX);
 
 	float *dc = factors[0][0];
 	float *ac = dc + 3;
@@ -75,16 +80,25 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
 	return buffer;
 }
 
-static float *multiplyBasisFunction(int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t bytesPerRow) {
+static float *multiplyBasisFunction(
+	int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t bytesPerRow,
+	float *cosX
+) {
 	float r = 0, g = 0, b = 0;
 	float normalisation = (xComponent == 0 && yComponent == 0) ? 1 : 2;
 
+	for(int x = 0; x < width; x++) {
+		cosX[x] = cosf(M_PI * xComponent * x / width);
+	}
+
 	for(int y = 0; y < height; y++) {
+		uint8_t *src = rgb + y * bytesPerRow;
+		float cosY = cosf(M_PI * yComponent * y / height);
 		for(int x = 0; x < width; x++) {
-			float basis = cosf(M_PI * xComponent * x / width) * cosf(M_PI * yComponent * y / height);
-			r += basis * sRGBToLinear_cache[rgb[3 * x + 0 + y * bytesPerRow]];
-			g += basis * sRGBToLinear_cache[rgb[3 * x + 1 + y * bytesPerRow]];
-			b += basis * sRGBToLinear_cache[rgb[3 * x + 2 + y * bytesPerRow]];
+			float basis = cosX[x] * cosY;
+			r += basis * sRGBToLinear_cache[src[3 * x + 0]];
+			g += basis * sRGBToLinear_cache[src[3 * x + 1]];
+			b += basis * sRGBToLinear_cache[src[3 * x + 2]];
 		}
 	}
 

From 2e19ea79948933c89d8e022c808e5279032e1c08 Mon Sep 17 00:00:00 2001
From: Aleksandr Karpinskii <homm86@gmail.com>
Date: Wed, 25 Sep 2024 12:03:30 +0400
Subject: [PATCH 07/14] Prepare cosX && cosY once for all passes

---
 C/encode.c | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/C/encode.c b/C/encode.c
index f3ea9dec..466dea9a 100644
--- a/C/encode.c
+++ b/C/encode.c
@@ -5,7 +5,7 @@
 
 static float *multiplyBasisFunction(
 	int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t bytesPerRow,
-	float *cosX);
+	float *cosX, float *cosY);
 static char *encode_int(int value, int length, char *destination);
 
 static int encodeDC(float r, float g, float b);
@@ -34,17 +34,32 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
 
 	init_sRGBToLinear_cache();
 
-	float *cosX = (float *)malloc(sizeof(float) * width);
+	float *cosX = (float *)malloc(sizeof(float) * width * xComponents);
 	if (! cosX) return NULL;
+	float *cosY = (float *)malloc(sizeof(float) * height);
+	if (! cosY) {
+		free(cosX);
+		return NULL;
+	}
+	for(int x = 0; x < xComponents; x++) {
+		for(int i = 0; i < width; i++) {
+			cosX[x * width + i] = cosf(M_PI * x * i / width);
+		}
+	}
 	for(int y = 0; y < yComponents; y++) {
+		for(int i = 0; i < height; i++) {
+			cosY[i] = cosf(M_PI * y * i / height);
+		}
 		for(int x = 0; x < xComponents; x++) {
-			float *factor = multiplyBasisFunction(x, y, width, height, rgb, bytesPerRow, cosX);
+			float *factor = multiplyBasisFunction(x, y, width, height, rgb, bytesPerRow,
+				cosX + x * width, cosY);
 			factors[y][x][0] = factor[0];
 			factors[y][x][1] = factor[1];
 			factors[y][x][2] = factor[2];
 		}
 	}
 	free(cosX);
+	free(cosY);
 
 	float *dc = factors[0][0];
 	float *ac = dc + 3;
@@ -82,20 +97,15 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
 
 static float *multiplyBasisFunction(
 	int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t bytesPerRow,
-	float *cosX
+	float *cosX, float *cosY
 ) {
 	float r = 0, g = 0, b = 0;
 	float normalisation = (xComponent == 0 && yComponent == 0) ? 1 : 2;
 
-	for(int x = 0; x < width; x++) {
-		cosX[x] = cosf(M_PI * xComponent * x / width);
-	}
-
 	for(int y = 0; y < height; y++) {
 		uint8_t *src = rgb + y * bytesPerRow;
-		float cosY = cosf(M_PI * yComponent * y / height);
 		for(int x = 0; x < width; x++) {
-			float basis = cosX[x] * cosY;
+			float basis = cosY[y] * cosX[x];
 			r += basis * sRGBToLinear_cache[src[3 * x + 0]];
 			g += basis * sRGBToLinear_cache[src[3 * x + 1]];
 			b += basis * sRGBToLinear_cache[src[3 * x + 2]];

From d3d26c1b9d2df59004789b3b49ca2e65922cabb5 Mon Sep 17 00:00:00 2001
From: Aleksandr Karpinskii <homm86@gmail.com>
Date: Wed, 25 Sep 2024 12:26:51 +0400
Subject: [PATCH 08/14] Calculate factors in one call (up to 1.6x speedup)

---
 C/encode.c | 79 +++++++++++++++++++++++++++++-------------------------
 1 file changed, 42 insertions(+), 37 deletions(-)

diff --git a/C/encode.c b/C/encode.c
index 466dea9a..39309bb7 100644
--- a/C/encode.c
+++ b/C/encode.c
@@ -3,8 +3,8 @@
 
 #include <string.h>
 
-static float *multiplyBasisFunction(
-	int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t bytesPerRow,
+static void multiplyBasisFunction(
+	float factors[][3], int factorsCount, int width, int height, uint8_t *rgb, size_t bytesPerRow,
 	float *cosX, float *cosY);
 static char *encode_int(int value, int length, char *destination);
 
@@ -29,41 +29,42 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
 	if(xComponents < 1 || xComponents > 9) return NULL;
 	if(yComponents < 1 || yComponents > 9) return NULL;
 
-	float factors[yComponents][xComponents][3];
+	float factors[yComponents * xComponents][3];
+	int factorsCount = xComponents * yComponents;
 	memset(factors, 0, sizeof(factors));
 
 	init_sRGBToLinear_cache();
 
-	float *cosX = (float *)malloc(sizeof(float) * width * xComponents);
+	float *cosX = (float *)malloc(sizeof(float) * width * factorsCount);
 	if (! cosX) return NULL;
-	float *cosY = (float *)malloc(sizeof(float) * height);
+	float *cosY = (float *)malloc(sizeof(float) * height * factorsCount);
 	if (! cosY) {
 		free(cosX);
 		return NULL;
 	}
-	for(int x = 0; x < xComponents; x++) {
-		for(int i = 0; i < width; i++) {
-			cosX[x * width + i] = cosf(M_PI * x * i / width);
+	for(int i = 0; i < width; i++) {
+		for(int x = 0; x < xComponents; x++) {
+			float weight = cosf(M_PI * x * i / width);
+			for(int y = 0; y < yComponents; y++) {
+				cosX[i * factorsCount + y * xComponents + x] = weight;
+			}
 		}
 	}
-	for(int y = 0; y < yComponents; y++) {
-		for(int i = 0; i < height; i++) {
-			cosY[i] = cosf(M_PI * y * i / height);
-		}
-		for(int x = 0; x < xComponents; x++) {
-			float *factor = multiplyBasisFunction(x, y, width, height, rgb, bytesPerRow,
-				cosX + x * width, cosY);
-			factors[y][x][0] = factor[0];
-			factors[y][x][1] = factor[1];
-			factors[y][x][2] = factor[2];
+	for(int i = 0; i < height; i++) {
+		for(int y = 0; y < yComponents; y++) {
+			float weight = cosf(M_PI * y * i / height);
+			for(int x = 0; x < xComponents; x++) {
+				cosY[i * factorsCount + y * xComponents + x] = weight;
+			}
 		}
 	}
+	multiplyBasisFunction(factors, factorsCount, width, height, rgb, bytesPerRow, cosX, cosY);
 	free(cosX);
 	free(cosY);
 
-	float *dc = factors[0][0];
+	float *dc = factors[0];
 	float *ac = dc + 3;
-	int acCount = xComponents * yComponents - 1;
+	int acCount = factorsCount - 1;
 	char *ptr = buffer;
 
 	int sizeFlag = (xComponents - 1) + (yComponents - 1) * 9;
@@ -95,31 +96,35 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
 	return buffer;
 }
 
-static float *multiplyBasisFunction(
-	int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t bytesPerRow,
+static void multiplyBasisFunction(
+	float factors[][3], int factorsCount, int width, int height, uint8_t *rgb, size_t bytesPerRow,
 	float *cosX, float *cosY
 ) {
-	float r = 0, g = 0, b = 0;
-	float normalisation = (xComponent == 0 && yComponent == 0) ? 1 : 2;
-
 	for(int y = 0; y < height; y++) {
 		uint8_t *src = rgb + y * bytesPerRow;
+		float *cosYLocal = cosY + y * factorsCount;
 		for(int x = 0; x < width; x++) {
-			float basis = cosY[y] * cosX[x];
-			r += basis * sRGBToLinear_cache[src[3 * x + 0]];
-			g += basis * sRGBToLinear_cache[src[3 * x + 1]];
-			b += basis * sRGBToLinear_cache[src[3 * x + 2]];
+			float pixel[3];
+			float *cosXLocal = cosX + x * factorsCount;
+			pixel[0] = sRGBToLinear_cache[src[3 * x + 0]];
+			pixel[1] = sRGBToLinear_cache[src[3 * x + 1]];
+			pixel[2] = sRGBToLinear_cache[src[3 * x + 2]];
+			for (int i = 0; i < factorsCount; i++) {
+				float basis = cosYLocal[i] * cosXLocal[i];
+				factors[i][0] += basis * pixel[0];
+				factors[i][1] += basis * pixel[1];
+				factors[i][2] += basis * pixel[2];
+			}
 		}
 	}
 
-	float scale = normalisation / (width * height);
-
-	static float result[3];
-	result[0] = r * scale;
-	result[1] = g * scale;
-	result[2] = b * scale;
-
-	return result;
+	for (int i = 0; i < factorsCount; i++) {
+		float normalisation = (i == 0) ? 1 : 2;
+		float scale = normalisation / (width * height);
+		factors[i][0] *= scale;
+		factors[i][1] *= scale;
+		factors[i][2] *= scale;
+	}
 }
 
 

From 52d4a62dba8bd5ad2affb9769507c4f7ca93c213 Mon Sep 17 00:00:00 2001
From: Aleksandr Karpinskii <homm86@gmail.com>
Date: Thu, 26 Sep 2024 21:01:07 +0400
Subject: [PATCH 09/14] unroll multiplyBasisFunction loop (2.5x speedup)

---
 C/encode.c | 33 +++++++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/C/encode.c b/C/encode.c
index 39309bb7..0e6dfdc3 100644
--- a/C/encode.c
+++ b/C/encode.c
@@ -4,7 +4,7 @@
 #include <string.h>
 
 static void multiplyBasisFunction(
-	float factors[][3], int factorsCount, int width, int height, uint8_t *rgb, size_t bytesPerRow,
+	float factors[][4], int factorsCount, int width, int height, uint8_t *rgb, size_t bytesPerRow,
 	float *cosX, float *cosY);
 static char *encode_int(int value, int length, char *destination);
 
@@ -29,7 +29,7 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
 	if(xComponents < 1 || xComponents > 9) return NULL;
 	if(yComponents < 1 || yComponents > 9) return NULL;
 
-	float factors[yComponents * xComponents][3];
+	float factors[yComponents * xComponents][4];
 	int factorsCount = xComponents * yComponents;
 	memset(factors, 0, sizeof(factors));
 
@@ -63,7 +63,7 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
 	free(cosY);
 
 	float *dc = factors[0];
-	float *ac = dc + 3;
+	float *ac = dc + 4;
 	int acCount = factorsCount - 1;
 	char *ptr = buffer;
 
@@ -73,7 +73,7 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
 	float maximumValue;
 	if(acCount > 0) {
 		float actualMaximumValue = 0;
-		for(int i = 0; i < acCount * 3; i++) {
+		for(int i = 0; i < acCount * 4; i++) {
 			actualMaximumValue = fmaxf(fabsf(ac[i]), actualMaximumValue);
 		}
 
@@ -88,7 +88,7 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
 	ptr = encode_int(encodeDC(dc[0], dc[1], dc[2]), 4, ptr);
 
 	for(int i = 0; i < acCount; i++) {
-		ptr = encode_int(encodeAC(ac[i * 3 + 0], ac[i * 3 + 1], ac[i * 3 + 2], maximumValue), 2, ptr);
+		ptr = encode_int(encodeAC(ac[i * 4 + 0], ac[i * 4 + 1], ac[i * 4 + 2], maximumValue), 2, ptr);
 	}
 
 	*ptr = 0;
@@ -97,14 +97,31 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
 }
 
 static void multiplyBasisFunction(
-	float factors[][3], int factorsCount, int width, int height, uint8_t *rgb, size_t bytesPerRow,
+	float factors[][4], int factorsCount, int width, int height, uint8_t *rgb, size_t bytesPerRow,
 	float *cosX, float *cosY
 ) {
 	for(int y = 0; y < height; y++) {
 		uint8_t *src = rgb + y * bytesPerRow;
 		float *cosYLocal = cosY + y * factorsCount;
-		for(int x = 0; x < width; x++) {
-			float pixel[3];
+		int x = 0;
+		for(; x < width - 3; x += 4) {
+			float *cosXLocal = cosX + x * factorsCount;
+			float pixel0[4] = {sRGBToLinear_cache[src[3 * (x+0) + 0]], sRGBToLinear_cache[src[3 * (x+0) + 1]], sRGBToLinear_cache[src[3 * (x+0) + 2]]};
+			float pixel1[4] = {sRGBToLinear_cache[src[3 * (x+1) + 0]], sRGBToLinear_cache[src[3 * (x+1) + 1]], sRGBToLinear_cache[src[3 * (x+1) + 2]]};
+			float pixel2[4] = {sRGBToLinear_cache[src[3 * (x+2) + 0]], sRGBToLinear_cache[src[3 * (x+2) + 1]], sRGBToLinear_cache[src[3 * (x+2) + 2]]};
+			float pixel3[4] = {sRGBToLinear_cache[src[3 * (x+3) + 0]], sRGBToLinear_cache[src[3 * (x+3) + 1]], sRGBToLinear_cache[src[3 * (x+3) + 2]]};
+			for (int i = 0; i < factorsCount; i++) {
+				float basis0 = cosYLocal[i] * cosXLocal[i + 0 * factorsCount];
+				float basis1 = cosYLocal[i] * cosXLocal[i + 1 * factorsCount];
+				float basis2 = cosYLocal[i] * cosXLocal[i + 2 * factorsCount];
+				float basis3 = cosYLocal[i] * cosXLocal[i + 3 * factorsCount];
+				factors[i][0] += basis0 * pixel0[0] + basis1 * pixel1[0] + basis2 * pixel2[0] + basis3 * pixel3[0];
+				factors[i][1] += basis0 * pixel0[1] + basis1 * pixel1[1] + basis2 * pixel2[1] + basis3 * pixel3[1];
+				factors[i][2] += basis0 * pixel0[2] + basis1 * pixel1[2] + basis2 * pixel2[2] + basis3 * pixel3[2];
+			}
+		}
+		for(; x < width; x++) {
+			float pixel[4];
 			float *cosXLocal = cosX + x * factorsCount;
 			pixel[0] = sRGBToLinear_cache[src[3 * x + 0]];
 			pixel[1] = sRGBToLinear_cache[src[3 * x + 1]];

From 89a2524ecbe86bcb5fc9942be65aa1d236f89d31 Mon Sep 17 00:00:00 2001
From: Aleksandr Karpinskii <homm86@gmail.com>
Date: Fri, 18 Oct 2024 12:43:32 +0400
Subject: [PATCH 10/14] Assign sRGBToLinear_cache after population to avoid
 races

---
 C/encode.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/C/encode.c b/C/encode.c
index 0e6dfdc3..b9ca6b7c 100644
--- a/C/encode.c
+++ b/C/encode.c
@@ -14,13 +14,16 @@ static int encodeAC(float r, float g, float b, float maximumValue);
 float *sRGBToLinear_cache = NULL;
 
 static void init_sRGBToLinear_cache() {
+	float *cache;
 	if (sRGBToLinear_cache != NULL) {
 		return;
 	}
-	sRGBToLinear_cache = (float *)malloc(sizeof(float) * 256);
+	cache = (float *)malloc(sizeof(float) * 256);
 	for (int x = 0; x < 256; x++) {
-		sRGBToLinear_cache[x] = sRGBToLinear(x);
+		cache[x] = sRGBToLinear(x);
 	}
+	// Assign cache after population to avoid races
+	sRGBToLinear_cache = cache;
 }
 
 const char *blurHashForPixels(int xComponents, int yComponents, int width, int height, uint8_t *rgb, size_t bytesPerRow) {
@@ -33,8 +36,6 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
 	int factorsCount = xComponents * yComponents;
 	memset(factors, 0, sizeof(factors));
 
-	init_sRGBToLinear_cache();
-
 	float *cosX = (float *)malloc(sizeof(float) * width * factorsCount);
 	if (! cosX) return NULL;
 	float *cosY = (float *)malloc(sizeof(float) * height * factorsCount);
@@ -100,6 +101,8 @@ static void multiplyBasisFunction(
 	float factors[][4], int factorsCount, int width, int height, uint8_t *rgb, size_t bytesPerRow,
 	float *cosX, float *cosY
 ) {
+	init_sRGBToLinear_cache();
+
 	for(int y = 0; y < height; y++) {
 		uint8_t *src = rgb + y * bytesPerRow;
 		float *cosYLocal = cosY + y * factorsCount;

From 3ab38fec4a06db2d175a95194ee2d0f5506530c4 Mon Sep 17 00:00:00 2001
From: Aleksandr Karpinskii <homm86@gmail.com>
Date: Wed, 23 Oct 2024 19:00:12 +0400
Subject: [PATCH 11/14] decoder: cosf is about 17% faster

---
 C/decode.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/C/decode.c b/C/decode.c
index a8cca05d..644a5b40 100644
--- a/C/decode.c
+++ b/C/decode.c
@@ -3,10 +3,8 @@
 
 static char chars[83] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz#$%*+,-.:;=?@[]^_{|}~";
 
-static inline uint8_t clampToUByte(int * src) {
-	if( *src >= 0 && *src <= 255 )
-		return *src;
-	return (*src < 0) ? 0 : 255;
+static inline uint8_t clampToUByte(int src) {
+	return (src < 0) ? 0 : (src > 255) ? 255 : src;
 }
 
 static inline uint8_t *  createByteArray(int size) {
@@ -107,7 +105,7 @@ int decodeToArray(const char * blurhash, int width, int height, int punch, int n
 
 			for(j = 0; j < numY; j ++) {
 				for(i = 0; i < numX; i ++) {
-					float basics = cos((M_PI * x * i) / width) * cos((M_PI * y * j) / height);
+					float basics = cosf((M_PI * x * i) / width) * cosf((M_PI * y * j) / height);
 					int idx = i + j * numX;
 					r += colors[idx][0] * basics;
 					g += colors[idx][1] * basics;
@@ -119,9 +117,9 @@ int decodeToArray(const char * blurhash, int width, int height, int punch, int n
 			intG = linearTosRGB(g);
 			intB = linearTosRGB(b);
 
-			pixelArray[nChannels * x + 0 + y * bytesPerRow] = clampToUByte(&intR);
-			pixelArray[nChannels * x + 1 + y * bytesPerRow] = clampToUByte(&intG);
-			pixelArray[nChannels * x + 2 + y * bytesPerRow] = clampToUByte(&intB);
+			pixelArray[nChannels * x + 0 + y * bytesPerRow] = clampToUByte(intR);
+			pixelArray[nChannels * x + 1 + y * bytesPerRow] = clampToUByte(intG);
+			pixelArray[nChannels * x + 2 + y * bytesPerRow] = clampToUByte(intB);
 
 			if (nChannels == 4)
 				pixelArray[nChannels * x + 3 + y * bytesPerRow] = 255;   // If nChannels=4, treat each pixel as RGBA instead of RGB

From 6f02d6e134a4a9bc7bbd7207a79f3cab7a485506 Mon Sep 17 00:00:00 2001
From: Aleksandr Karpinskii <homm86@gmail.com>
Date: Wed, 23 Oct 2024 19:06:38 +0400
Subject: [PATCH 12/14] decoder: cache cos (3.2x faster)

---
 C/decode.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/C/decode.c b/C/decode.c
index 644a5b40..7dcfb3aa 100644
--- a/C/decode.c
+++ b/C/decode.c
@@ -98,6 +98,19 @@ int decodeToArray(const char * blurhash, int width, int height, int punch, int n
 	int x = 0, y = 0, i = 0, j = 0;
 	int intR = 0, intG = 0, intB = 0;
 
+	float *cosx = malloc(width * numX * sizeof(float));
+	float *cosy = malloc(height * numX * sizeof(float));
+	for(x = 0; x < width; x ++) {
+		for(i = 0; i < numX; i ++) {
+			cosx[x*numX + i] = cosf((M_PI * x * i) / width);
+		}
+	}
+	for(y = 0; y < height; y ++) {
+		for(j = 0; j < numY; j ++) {
+			cosy[y*numY + j] = cosf((M_PI * y * j) / height);
+		}
+	}
+
 	for(y = 0; y < height; y ++) {
 		for(x = 0; x < width; x ++) {
 
@@ -105,7 +118,7 @@ int decodeToArray(const char * blurhash, int width, int height, int punch, int n
 
 			for(j = 0; j < numY; j ++) {
 				for(i = 0; i < numX; i ++) {
-					float basics = cosf((M_PI * x * i) / width) * cosf((M_PI * y * j) / height);
+					float basics = cosx[x*numX + i] * cosy[y*numY + j];
 					int idx = i + j * numX;
 					r += colors[idx][0] * basics;
 					g += colors[idx][1] * basics;
@@ -127,6 +140,9 @@ int decodeToArray(const char * blurhash, int width, int height, int punch, int n
 		}
 	}
 
+	free(cosx);
+	free(cosy);
+
 	return 0;
 }
 

From 0f2d8c90627597869df1217186f6d988afa72476 Mon Sep 17 00:00:00 2001
From: Aleksandr Karpinskii <homm86@gmail.com>
Date: Wed, 23 Oct 2024 19:19:15 +0400
Subject: [PATCH 13/14] decode: unroll inner loop (20% faster)

---
 C/decode.c | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/C/decode.c b/C/decode.c
index 7dcfb3aa..f7eda246 100644
--- a/C/decode.c
+++ b/C/decode.c
@@ -98,16 +98,22 @@ int decodeToArray(const char * blurhash, int width, int height, int punch, int n
 	int x = 0, y = 0, i = 0, j = 0;
 	int intR = 0, intG = 0, intB = 0;
 
-	float *cosx = malloc(width * numX * sizeof(float));
-	float *cosy = malloc(height * numX * sizeof(float));
+	float *cosX = malloc(width * colors_size * sizeof(float));
+	float *cosY = malloc(height * colors_size * sizeof(float));
 	for(x = 0; x < width; x ++) {
 		for(i = 0; i < numX; i ++) {
-			cosx[x*numX + i] = cosf((M_PI * x * i) / width);
+			float weight = cosf(M_PI * x * i / width);
+			for(j = 0; j < numY; j ++) {
+				cosX[x * colors_size + j * numX + i] = weight;
+			}
 		}
 	}
 	for(y = 0; y < height; y ++) {
 		for(j = 0; j < numY; j ++) {
-			cosy[y*numY + j] = cosf((M_PI * y * j) / height);
+			float weight = cosf((M_PI * y * j) / height);
+			for(i = 0; i < numX; i ++) {
+				cosY[y * colors_size + j * numX + i] = weight;
+			}
 		}
 	}
 
@@ -116,14 +122,11 @@ int decodeToArray(const char * blurhash, int width, int height, int punch, int n
 
 			float r = 0, g = 0, b = 0;
 
-			for(j = 0; j < numY; j ++) {
-				for(i = 0; i < numX; i ++) {
-					float basics = cosx[x*numX + i] * cosy[y*numY + j];
-					int idx = i + j * numX;
-					r += colors[idx][0] * basics;
-					g += colors[idx][1] * basics;
-					b += colors[idx][2] * basics;
-				}
+			for (int idx = 0; idx < colors_size; idx ++) {
+				float basics = cosX[x * colors_size + idx] * cosY[y * colors_size + idx];
+				r += colors[idx][0] * basics;
+				g += colors[idx][1] * basics;
+				b += colors[idx][2] * basics;
 			}
 
 			intR = linearTosRGB(r);
@@ -140,8 +143,8 @@ int decodeToArray(const char * blurhash, int width, int height, int punch, int n
 		}
 	}
 
-	free(cosx);
-	free(cosy);
+	free(cosX);
+	free(cosY);
 
 	return 0;
 }

From d2a09cd8671ea5d2b239a078f83ea1e554975a02 Mon Sep 17 00:00:00 2001
From: Aleksandr Karpinskii <homm86@gmail.com>
Date: Wed, 23 Oct 2024 19:40:45 +0400
Subject: [PATCH 14/14] decode: Cache linearTosRGB (2.75x speedup)

---
 C/decode.c | 35 +++++++++++++++++++++++++----------
 1 file changed, 25 insertions(+), 10 deletions(-)

diff --git a/C/decode.c b/C/decode.c
index f7eda246..640a3112 100644
--- a/C/decode.c
+++ b/C/decode.c
@@ -3,8 +3,26 @@
 
 static char chars[83] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz#$%*+,-.:;=?@[]^_{|}~";
 
-static inline uint8_t clampToUByte(int src) {
-	return (src < 0) ? 0 : (src > 255) ? 255 : src;
+#define CACHE_ACCURACY 8191
+
+static inline int convertToCacheIdx(float src) { // maps a linear value to a lookup index in [0, CACHE_ACCURACY]
+	double pos = src * CACHE_ACCURACY + 0.5; // round half up; clamp BEFORE the int cast: converting NaN or out-of-range values to int is undefined
+	return !(pos >= 0) ? 0 : (pos >= CACHE_ACCURACY) ? CACHE_ACCURACY : (int)pos;
+}
+
+uint8_t *linearTosRGB_cache = NULL;
+
+static void init_linearTosRGB_cache() { // builds the linearTosRGB lookup table (CACHE_ACCURACY + 1 entries) on first call
+	uint8_t *cache;
+	if (linearTosRGB_cache != NULL) { // table was already published by an earlier call
+		return;
+	}
+	cache = (uint8_t *)malloc(sizeof(uint8_t) * (CACHE_ACCURACY + 1)); // NOTE(review): malloc result is not NULL-checked before the loop writes to it
+	for (int x = 0; x <= CACHE_ACCURACY; x++) {
+		cache[x] = linearTosRGB((float)x / CACHE_ACCURACY); // entry x holds linearTosRGB(x / CACHE_ACCURACY), stored as uint8_t
+	}
+	// Assign cache after population to avoid races
+	linearTosRGB_cache = cache;
 }
 
 static inline uint8_t *  createByteArray(int size) {
@@ -96,7 +114,6 @@ int decodeToArray(const char * blurhash, int width, int height, int punch, int n
 
 	int bytesPerRow = width * nChannels;
 	int x = 0, y = 0, i = 0, j = 0;
-	int intR = 0, intG = 0, intB = 0;
 
 	float *cosX = malloc(width * colors_size * sizeof(float));
 	float *cosY = malloc(height * colors_size * sizeof(float));
@@ -117,6 +134,8 @@ int decodeToArray(const char * blurhash, int width, int height, int punch, int n
 		}
 	}
 
+	init_linearTosRGB_cache();
+
 	for(y = 0; y < height; y ++) {
 		for(x = 0; x < width; x ++) {
 
@@ -129,13 +148,9 @@ int decodeToArray(const char * blurhash, int width, int height, int punch, int n
 				b += colors[idx][2] * basics;
 			}
 
-			intR = linearTosRGB(r);
-			intG = linearTosRGB(g);
-			intB = linearTosRGB(b);
-
-			pixelArray[nChannels * x + 0 + y * bytesPerRow] = clampToUByte(intR);
-			pixelArray[nChannels * x + 1 + y * bytesPerRow] = clampToUByte(intG);
-			pixelArray[nChannels * x + 2 + y * bytesPerRow] = clampToUByte(intB);
+			pixelArray[nChannels * x + 0 + y * bytesPerRow] = linearTosRGB_cache[convertToCacheIdx(r)];
+			pixelArray[nChannels * x + 1 + y * bytesPerRow] = linearTosRGB_cache[convertToCacheIdx(g)];
+			pixelArray[nChannels * x + 2 + y * bytesPerRow] = linearTosRGB_cache[convertToCacheIdx(b)];
 
 			if (nChannels == 4)
 				pixelArray[nChannels * x + 3 + y * bytesPerRow] = 255;   // If nChannels=4, treat each pixel as RGBA instead of RGB