diff --git a/C/Makefile b/C/Makefile
index 33dddaa4..cd02064e 100644
--- a/C/Makefile
+++ b/C/Makefile
@@ -1,12 +1,18 @@
 PROGRAM=blurhash_encoder
 DECODER=blurhash_decoder
-$(PROGRAM): encode_stb.c encode.c encode.h stb_image.h common.h
-	$(CC) -o $@ encode_stb.c encode.c -lm -Ofast
 
-$(DECODER): decode_stb.c decode.c decode.h stb_writer.h common.h
-	$(CC) -o $(DECODER) decode_stb.c decode.c -lm -Ofast
+encod%.o: encod%.c encode.h stb_image.h common.h
+	$(CC) -c $< -o $@ -Ofast -Wall
+$(PROGRAM): encode_stb.o encode.o
+	$(CC) -o $@ encode_stb.o encode.o -lm
+
+decod%.o: decod%.c decode.h stb_writer.h common.h
+	$(CC) -c $< -o $@ -Ofast -Wall
+$(DECODER): decode_stb.o decode.o
+	$(CC) -o $@ decode_stb.o decode.o -lm
 
 .PHONY: clean
 clean:
 	rm -f $(PROGRAM)
-	rm -f $(DECODER)
\ No newline at end of file
+	rm -f $(DECODER)
+	rm -f *.o
diff --git a/C/common.h b/C/common.h
index ce581442..029b9bee 100644
--- a/C/common.h
+++ b/C/common.h
@@ -1,11 +1,16 @@
 #ifndef __BLURHASH_COMMON_H__
 #define __BLURHASH_COMMON_H__
 
-#include<math.h>
+/* MSVC only defines M_PI when _USE_MATH_DEFINES is set before <math.h>. */
+#ifndef _USE_MATH_DEFINES
+#define _USE_MATH_DEFINES
+#endif
+#include <math.h>
 
+/* M_PI is not ISO C; strict -std=c99/c11 builds need this fallback. */
 #ifndef M_PI
 #define M_PI 3.14159265358979323846
 #endif
 
 static inline int linearTosRGB(float value) {
 	float v = fmaxf(0, fminf(1, value));
diff --git a/C/decode.c b/C/decode.c
index a8cca05d..640a3112 100644
--- a/C/decode.c
+++ b/C/decode.c
@@ -3,10 +3,26 @@
 
 static char chars[83] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz#$%*+,-.:;=?@[]^_{|}~";
 
-static inline uint8_t clampToUByte(int * src) {
-	if( *src >= 0 && *src <= 255 )
-		return *src;
-	return (*src < 0) ? 0 : 255;
+#define CACHE_ACCURACY 8191
+
+static inline int convertToCacheIdx(float src) {
+	if (!(src > 0)) return 0;	// also catches NaN; converting an out-of-range float to int is undefined
+	return (src >= 1) ? CACHE_ACCURACY : (int)(src * CACHE_ACCURACY + 0.5);
+}
+
+// sRGB byte for each quantised linear value x / CACHE_ACCURACY; static storage, so nothing to allocate or free.
+static uint8_t linearTosRGB_cache[CACHE_ACCURACY + 1];
+
+// Fill linearTosRGB_cache on first use. Not synchronised: concurrent first calls would store the same values.
+static void init_linearTosRGB_cache(void) {
+	static int initialised = 0;
+	if (initialised) {
+		return;
+	}
+	for (int x = 0; x <= CACHE_ACCURACY; x++) {
+		linearTosRGB_cache[x] = linearTosRGB((float)x / CACHE_ACCURACY);
+	}
+	initialised = 1;
 }
 
 static inline uint8_t *  createByteArray(int size) {
@@ -98,30 +114,49 @@ int decodeToArray(const char * blurhash, int width, int height, int punch, int n
 
 	int bytesPerRow = width * nChannels;
 	int x = 0, y = 0, i = 0, j = 0;
-	int intR = 0, intG = 0, intB = 0;
+
+	// sizeof first, so the byte count is computed in size_t rather than int
+	float *cosX = malloc(sizeof(float) * width * colors_size);
+	float *cosY = malloc(sizeof(float) * height * colors_size);
+	if (cosX == NULL || cosY == NULL) {
+		free(cosX);
+		free(cosY);
+		return -1;	// NOTE(review): -1 assumed to be this function's failure code; confirm against its other error returns
+	}
+	for(x = 0; x < width; x ++) {
+		for(i = 0; i < numX; i ++) {
+			float weight = cosf(M_PI * x * i / width);
+			for(j = 0; j < numY; j ++) {
+				cosX[x * colors_size + j * numX + i] = weight;
+			}
+		}
+	}
+	for(y = 0; y < height; y ++) {
+		for(j = 0; j < numY; j ++) {
+			float weight = cosf((M_PI * y * j) / height);
+			for(i = 0; i < numX; i ++) {
+				cosY[y * colors_size + j * numX + i] = weight;
+			}
+		}
+	}
+
+	init_linearTosRGB_cache();
 
 	for(y = 0; y < height; y ++) {
 		for(x = 0; x < width; x ++) {
 
 			float r = 0, g = 0, b = 0;
 
-			for(j = 0; j < numY; j ++) {
-				for(i = 0; i < numX; i ++) {
-					float basics = cos((M_PI * x * i) / width) * cos((M_PI * y * j) / height);
-					int idx = i + j * numX;
-					r += colors[idx][0] * basics;
-					g += colors[idx][1] * basics;
-					b += colors[idx][2] * basics;
-				}
+			for (int idx = 0; idx < colors_size; idx ++) {
+				float basics = cosX[x * colors_size + idx] * cosY[y * colors_size + idx];
+				r += colors[idx][0] * basics;
+				g += colors[idx][1] * basics;
+				b += colors[idx][2] * basics;
 			}
 
-			intR = linearTosRGB(r);
-			intG = linearTosRGB(g);
-			intB = linearTosRGB(b);
-
-			pixelArray[nChannels * x + 0 + y * bytesPerRow] = clampToUByte(&intR);
-			pixelArray[nChannels * x + 1 + y * bytesPerRow] = clampToUByte(&intG);
-			pixelArray[nChannels * x + 2 + y * bytesPerRow] = clampToUByte(&intB);
+			pixelArray[nChannels * x + 0 + y * bytesPerRow] = linearTosRGB_cache[convertToCacheIdx(r)];
+			pixelArray[nChannels * x + 1 + y * bytesPerRow] = linearTosRGB_cache[convertToCacheIdx(g)];
+			pixelArray[nChannels * x + 2 + y * bytesPerRow] = linearTosRGB_cache[convertToCacheIdx(b)];
 
 			if (nChannels == 4)
 				pixelArray[nChannels * x + 3 + y * bytesPerRow] = 255;   // If nChannels=4, treat each pixel as RGBA instead of RGB
@@ -129,6 +158,9 @@ int decodeToArray(const char * blurhash, int width, int height, int punch, int n
 		}
 	}
 
+	free(cosX);
+	free(cosY);
+
 	return 0;
 }
 
diff --git a/C/encode.c b/C/encode.c
index c7a39da2..b9ca6b7c 100644
--- a/C/encode.c
+++ b/C/encode.c
@@ -3,33 +3,69 @@
 
 #include <string.h>
 
-static float *multiplyBasisFunction(int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t bytesPerRow);
+static void multiplyBasisFunction(
+	float factors[][4], int factorsCount, int width, int height, uint8_t *rgb, size_t bytesPerRow,
+	float *cosX, float *cosY);
 static char *encode_int(int value, int length, char *destination);
 
 static int encodeDC(float r, float g, float b);
 static int encodeAC(float r, float g, float b, float maximumValue);
 
+// Linear-light value of each sRGB byte; static storage, so nothing to allocate, check or free.
+static float sRGBToLinear_cache[256];
+
+// Fill sRGBToLinear_cache on first use. Not synchronised: concurrent first calls would store the same values.
+static void init_sRGBToLinear_cache(void) {
+	static int initialised = 0;
+	if (initialised) {
+		return;
+	}
+	for (int x = 0; x < 256; x++) {
+		sRGBToLinear_cache[x] = sRGBToLinear(x);
+	}
+	initialised = 1;
+}
+
 const char *blurHashForPixels(int xComponents, int yComponents, int width, int height, uint8_t *rgb, size_t bytesPerRow) {
 	static char buffer[2 + 4 + (9 * 9 - 1) * 2 + 1];
 
 	if(xComponents < 1 || xComponents > 9) return NULL;
 	if(yComponents < 1 || yComponents > 9) return NULL;
 
-	float factors[yComponents][xComponents][3];
+	float factors[yComponents * xComponents][4];
+	int factorsCount = xComponents * yComponents;
 	memset(factors, 0, sizeof(factors));
 
-	for(int y = 0; y < yComponents; y++) {
+	float *cosX = (float *)malloc(sizeof(float) * width * factorsCount);
+	if (! cosX) return NULL;
+	float *cosY = (float *)malloc(sizeof(float) * height * factorsCount);
+	if (! cosY) {
+		free(cosX);
+		return NULL;
+	}
+	for(int i = 0; i < width; i++) {
 		for(int x = 0; x < xComponents; x++) {
-			float *factor = multiplyBasisFunction(x, y, width, height, rgb, bytesPerRow);
-			factors[y][x][0] = factor[0];
-			factors[y][x][1] = factor[1];
-			factors[y][x][2] = factor[2];
+			float weight = cosf(M_PI * x * i / width);
+			for(int y = 0; y < yComponents; y++) {
+				cosX[i * factorsCount + y * xComponents + x] = weight;
+			}
 		}
 	}
+	for(int i = 0; i < height; i++) {
+		for(int y = 0; y < yComponents; y++) {
+			float weight = cosf(M_PI * y * i / height);
+			for(int x = 0; x < xComponents; x++) {
+				cosY[i * factorsCount + y * xComponents + x] = weight;
+			}
+		}
+	}
+	multiplyBasisFunction(factors, factorsCount, width, height, rgb, bytesPerRow, cosX, cosY);
+	free(cosX);
+	free(cosY);
 
-	float *dc = factors[0][0];
-	float *ac = dc + 3;
-	int acCount = xComponents * yComponents - 1;
+	float *dc = factors[0];
+	float *ac = dc + 4;
+	int acCount = factorsCount - 1;
 	char *ptr = buffer;
 
 	int sizeFlag = (xComponents - 1) + (yComponents - 1) * 9;
@@ -38,7 +74,7 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
 	float maximumValue;
 	if(acCount > 0) {
 		float actualMaximumValue = 0;
-		for(int i = 0; i < acCount * 3; i++) {
+		for(int i = 0; i < acCount * 4; i++) {
 			actualMaximumValue = fmaxf(fabsf(ac[i]), actualMaximumValue);
 		}
 
@@ -53,7 +89,7 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
 	ptr = encode_int(encodeDC(dc[0], dc[1], dc[2]), 4, ptr);
 
 	for(int i = 0; i < acCount; i++) {
-		ptr = encode_int(encodeAC(ac[i * 3 + 0], ac[i * 3 + 1], ac[i * 3 + 2], maximumValue), 2, ptr);
+		ptr = encode_int(encodeAC(ac[i * 4 + 0], ac[i * 4 + 1], ac[i * 4 + 2], maximumValue), 2, ptr);
 	}
 
 	*ptr = 0;
@@ -61,27 +97,54 @@ const char *blurHashForPixels(int xComponents, int yComponents, int width, int h
 	return buffer;
 }
 
-static float *multiplyBasisFunction(int xComponent, int yComponent, int width, int height, uint8_t *rgb, size_t bytesPerRow) {
-	float r = 0, g = 0, b = 0;
-	float normalisation = (xComponent == 0 && yComponent == 0) ? 1 : 2;
+static void multiplyBasisFunction(
+	float factors[][4], int factorsCount, int width, int height, uint8_t *rgb, size_t bytesPerRow,
+	float *cosX, float *cosY
+) {
+	init_sRGBToLinear_cache();
 
 	for(int y = 0; y < height; y++) {
-		for(int x = 0; x < width; x++) {
-			float basis = cosf(M_PI * xComponent * x / width) * cosf(M_PI * yComponent * y / height);
-			r += basis * sRGBToLinear(rgb[3 * x + 0 + y * bytesPerRow]);
-			g += basis * sRGBToLinear(rgb[3 * x + 1 + y * bytesPerRow]);
-			b += basis * sRGBToLinear(rgb[3 * x + 2 + y * bytesPerRow]);
+		uint8_t *src = rgb + y * bytesPerRow;
+		float *cosYLocal = cosY + y * factorsCount;
+		int x = 0;
+		for(; x < width - 3; x += 4) {
+			float *cosXLocal = cosX + x * factorsCount;
+			float pixel0[4] = {sRGBToLinear_cache[src[3 * (x+0) + 0]], sRGBToLinear_cache[src[3 * (x+0) + 1]], sRGBToLinear_cache[src[3 * (x+0) + 2]]};
+			float pixel1[4] = {sRGBToLinear_cache[src[3 * (x+1) + 0]], sRGBToLinear_cache[src[3 * (x+1) + 1]], sRGBToLinear_cache[src[3 * (x+1) + 2]]};
+			float pixel2[4] = {sRGBToLinear_cache[src[3 * (x+2) + 0]], sRGBToLinear_cache[src[3 * (x+2) + 1]], sRGBToLinear_cache[src[3 * (x+2) + 2]]};
+			float pixel3[4] = {sRGBToLinear_cache[src[3 * (x+3) + 0]], sRGBToLinear_cache[src[3 * (x+3) + 1]], sRGBToLinear_cache[src[3 * (x+3) + 2]]};
+			for (int i = 0; i < factorsCount; i++) {
+				float basis0 = cosYLocal[i] * cosXLocal[i + 0 * factorsCount];
+				float basis1 = cosYLocal[i] * cosXLocal[i + 1 * factorsCount];
+				float basis2 = cosYLocal[i] * cosXLocal[i + 2 * factorsCount];
+				float basis3 = cosYLocal[i] * cosXLocal[i + 3 * factorsCount];
+				factors[i][0] += basis0 * pixel0[0] + basis1 * pixel1[0] + basis2 * pixel2[0] + basis3 * pixel3[0];
+				factors[i][1] += basis0 * pixel0[1] + basis1 * pixel1[1] + basis2 * pixel2[1] + basis3 * pixel3[1];
+				factors[i][2] += basis0 * pixel0[2] + basis1 * pixel1[2] + basis2 * pixel2[2] + basis3 * pixel3[2];
+			}
+		}
+		for(; x < width; x++) {
+			float pixel[4];
+			float *cosXLocal = cosX + x * factorsCount;
+			pixel[0] = sRGBToLinear_cache[src[3 * x + 0]];
+			pixel[1] = sRGBToLinear_cache[src[3 * x + 1]];
+			pixel[2] = sRGBToLinear_cache[src[3 * x + 2]];
+			for (int i = 0; i < factorsCount; i++) {
+				float basis = cosYLocal[i] * cosXLocal[i];
+				factors[i][0] += basis * pixel[0];
+				factors[i][1] += basis * pixel[1];
+				factors[i][2] += basis * pixel[2];
+			}
 		}
 	}
 
-	float scale = normalisation / (width * height);
-
-	static float result[3];
-	result[0] = r * scale;
-	result[1] = g * scale;
-	result[2] = b * scale;
-
-	return result;
+	for (int i = 0; i < factorsCount; i++) {
+		float normalisation = (i == 0) ? 1 : 2;
+		float scale = normalisation / (width * height);
+		factors[i][0] *= scale;
+		factors[i][1] *= scale;
+		factors[i][2] *= scale;
+	}
 }
 
 
diff --git a/C/encode_stb.c b/C/encode_stb.c
index cd3e461a..811ca000 100644
--- a/C/encode_stb.c
+++ b/C/encode_stb.c
@@ -15,8 +15,8 @@ int main(int argc, const char **argv) {
 
 	int xComponents = atoi(argv[1]);
 	int yComponents = atoi(argv[2]);
-	if(xComponents < 1 || xComponents > 8 || yComponents < 1 || yComponents > 8) {
-		fprintf(stderr, "Component counts must be between 1 and 8.\n");
+	if(xComponents < 1 || xComponents > 9 || yComponents < 1 || yComponents > 9) {
+		fprintf(stderr, "Component counts must be between 1 and 9.\n");
 		return 1;
 	}