From 27e3fe702769f350d62fe5669399e8901d949da6 Mon Sep 17 00:00:00 2001
From: Artemis Rosman <73006620+rozukke@users.noreply.github.com>
Date: Mon, 8 Jul 2024 19:57:15 +1000
Subject: [PATCH 1/8] Update CMake for conditional compilation

---
 CMakeLists.txt | 32 ++++++++++++++++++++------------
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index faea7b0..0ead262 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,34 +1,42 @@
 cmake_minimum_required(VERSION 3.16)
 
-project(ichida-algo LANGUAGES C CXX)
+project(ichida-algo LANGUAGES C CUDA)
 
-set(CMAKE_C_FLAGS "-O3 -march=native -ffast-math -funroll-loops -fopenmp -Wall -Wextra")
+set(CMAKE_C_FLAGS "-O3 -march=native -ffast-math -funroll-loops -fopenmp -Wall -Wextra -Wpedantic")
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED True)
-set(CMAKE_VERBOSE_MAKEFILE ON)
 
-set(INC_DIR include)
-set(SRC_DIR src)
+set(SOURCE_DIR src)
 set(CUDA_SRC_DIR cudasrc)
 
-include_directories(${INC_DIR})
-
-file(GLOB_RECURSE SOURCE_FILES ${SRC_DIR}/*.c)
+file(GLOB SOURCE_FILES ${SOURCE_DIR}/*.c)
 
 add_executable(speed_cpu ${SOURCE_FILES})
 target_link_libraries(speed_cpu m pthread gomp)
 
 find_package(CUDA)
-
 if(CUDA_FOUND)
     enable_language(CUDA)
+    set(CMAKE_CUDA_ARCHITECTURES "80")
     set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xptxas -O3 --use_fast_math -Xcompiler -march=native -unroll-aggressive -arch=sm_80")
-    find_package(MPI REQUIRED)
-    include_directories(${MPI_INCLUDE_PATH})
     file(GLOB_RECURSE CUDA_SOURCE_FILES ${CUDA_SRC_DIR}/*.cu)
     add_executable(speed_gpu ${CUDA_SOURCE_FILES})
     set_target_properties(speed_gpu PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-    target_link_libraries(speed_gpu m ${MPI_LIBRARIES})
+
+    if(COMPILE_MPI)
+        find_package(MPI)
+        if(MPI_FOUND)
+            include_directories(${MPI_INCLUDE_PATH})
+            target_link_libraries(speed_gpu m ${MPI_LIBRARIES})
+            set_property(TARGET speed_gpu PROPERTY COMPILE_DEFINITIONS USE_MPI)
+        else()
+            message(STATUS "MPI not found. Please install library to compile with MPI enabled.")
+        endif(MPI_FOUND)
+        
+    else() 
+        target_link_libraries(speed_gpu m)
+    endif()
+
 else()
     message(STATUS "CUDA not found, only CPU version will be built.")
 endif()

From 7ea1a82825354103f10d4bc671f5345955418262 Mon Sep 17 00:00:00 2001
From: Artemis Rosman <73006620+rozukke@users.noreply.github.com>
Date: Mon, 8 Jul 2024 19:57:31 +1000
Subject: [PATCH 2/8] Update Makefile for conditional compilation

---
 Makefile | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/Makefile b/Makefile
index 0253aee..b82c40b 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
-.PHONY: all clean build run_cpu run_gpu test_cpu test_gpu bench stat
+.PHONY: all clean build build_mpi build_gpu test_cpu test_gpu bench stat
 
 # Default iterations
-iterations ?= 1000
+iterations ?= 100000
 
 all: build
 
@@ -11,26 +11,34 @@ clean:
 	rm -rf build
 	rm -f speed_cpu speed_gpu
 
-build: clean
+build_gpu: clean
 	cmake -S . -B build -DCMAKE_BUILD_TYPE=Release
 	$(MAKE) -C build
 	cp -u build/speed_cpu ./
 	if [ -f build/speed_gpu ]; then cp -u build/speed_gpu ./; fi
 
-run_cpu: build
+build_mpi: clean
+	cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DCOMPILE_MPI=True
+	$(MAKE) -C build
+	cp -u build/speed_cpu ./
+	if [ -f build/speed_gpu ]; then cp -u build/speed_gpu ./; fi
+
+build: build_mpi
+
+run_cpu:
 	./speed_cpu ./weights_and_biases.txt ./tensors $(iterations)
 
-run_gpu: build
+run_gpu:
+	./speed_gpu ./weights_and_biases.txt ./tensors $(iterations)
+
+run_mpi:
 	n_gpus=$(shell nvidia-smi --query-gpu=name --format=csv,noheader | wc -l); \
 	mpirun -np $$n_gpus ./speed_gpu ./weights_and_biases.txt ./tensors $(iterations)
 
-test_cpu: build
-	./speed_cpu ./weights_and_biases.txt ./tensors $(iterations)
+test_cpu: build run_cpu
 	mv ./results.csv ./test
 	python3 ./test/verify_csv.py
 
-test_gpu: build
-	n_gpus=$(shell nvidia-smi --query-gpu=name --format=csv,noheader | wc -l); \
-	mpirun -np $$n_gpus ./speed_gpu ./weights_and_biases.txt ./tensors $(iterations)
+test_gpu: build run_mpi
 	mv ./results.csv ./test
 	python3 ./test/verify_csv.py
\ No newline at end of file

From 5e4f509b7e55fbd233e36a58cfe38ee97a4a45f7 Mon Sep 17 00:00:00 2001
From: Artemis Rosman <73006620+rozukke@users.noreply.github.com>
Date: Mon, 8 Jul 2024 19:57:58 +1000
Subject: [PATCH 3/8] CUDA general quality pass

---
 cudasrc/main.cu   | 168 ++++++++++++++++++++++++++--------------------
 cudasrc/matrix.cu |  13 +---
 2 files changed, 99 insertions(+), 82 deletions(-)

diff --git a/cudasrc/main.cu b/cudasrc/main.cu
index 732c948..516452c 100644
--- a/cudasrc/main.cu
+++ b/cudasrc/main.cu
@@ -1,32 +1,28 @@
 #include "matrix.cuh"
 #include <dirent.h>
-#include <mpi.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
+#ifdef USE_MPI
+#include <mpi.h>
+#endif
 
 #define NUM_LAYERS 7
+#define TENSOR_LENGTH 225
 
-#define CUDA_CHECK(call)                                                                                               \
-    do {                                                                                                               \
-        cudaError_t err = call;                                                                                        \
-        if (err != cudaSuccess) {                                                                                      \
-            fprintf(stderr, "CUDA error in %s (%s:%d): %s\n", __func__, __FILE__, __LINE__, cudaGetErrorString(err));  \
-            exit(EXIT_FAILURE);                                                                                        \
-        }                                                                                                              \
-    } while (0)
+#define BLOCKS 108
+#define THREADS_PER_BLOCK 1024
 
 matrix* weights[NUM_LAYERS];
 matrix* biases[NUM_LAYERS];
+f32* inputs;
+int* results;
 
-// device weights and biases;
+// Device memory
 matrix** d_weights;
 matrix** d_biases;
-
-float* inputs;
-float* d_inputs;
-int* results;
+f32* d_inputs;
 int* d_results;
 
 char letters[52] = {'A', 'a', 'B', 'b', 'C', 'c', 'D', 'd', 'E', 'e', 'F', 'f', 'G', 'g', 'H', 'h', 'I', 'i',
@@ -84,12 +80,16 @@ void read_model(const char* file_name) {
     fclose(file);
 }
 
-void read_tensor(float* a, const char* fileName) {
+void read_tensor(float* out, const char* fileName) {
     FILE* file = fopen(fileName, "r");
     char* line = NULL;
     size_t len = 0;
 
-    getline(&line, &len, file);
+    if (getline(&line, &len, file) == -1) {
+        perror("Could not read tensor file.\n");
+        exit(EXIT_FAILURE);
+    }
+
     char* token;
     float value;
     const char* delimiter = ",";
@@ -97,40 +97,51 @@ void read_tensor(float* a, const char* fileName) {
 
     for (int i = 0; i < 225; i++) {
         value = strtof(token, NULL);
-        a[i] = value;
+        out[i] = value;
         token = strtok(NULL, delimiter);
     }
     free(line);
     fclose(file);
 }
 
-__device__ void propagate_fwd(matrix* weights, float* input_layer, float* output_layer, matrix* biases) {
+int file_count(const char* dir_path) {
+    struct dirent* entry;
+    DIR* dir = opendir(dir_path);
+
+    // Count inputs
+    int num_inputs = 0;
+    while ((entry = readdir(dir)) != NULL) {
+        if (entry->d_type == DT_REG)
+            num_inputs++;
+    }
+
+    return num_inputs;
+}
+
+__device__ void propagate_fwd(matrix* weights, f32* input_layer, f32* output_layer, matrix* biases) {
     matrix_mul(weights->data, input_layer, output_layer, weights->rows, weights->cols);
     matrix_add(output_layer, biases->data, biases->rows);
 }
 
-#define BLOCKS 108
-#define THREADS_PER_BLOCK 1024
-
 __global__ void infer(float* d_inputs, int* d_results, matrix** d_weights, matrix** d_biases, int it_per_input,
                       int in_num) {
 
-    __shared__ float sharedInput[225];
+    __shared__ float shared_input[TENSOR_LENGTH];
     float out1[98];
     float out2[65];
 
     int num_threads = blockDim.x * gridDim.x;
     int thread_idx = (blockIdx.x * blockDim.x + threadIdx.x);
 
-    float* input = (float*)&d_inputs[in_num * 225];
+    float* input = (float*)&d_inputs[in_num * TENSOR_LENGTH];
 
-    if (threadIdx.x < 225) {
-        sharedInput[threadIdx.x] = input[threadIdx.x];
+    if (threadIdx.x < TENSOR_LENGTH) {
+        shared_input[threadIdx.x] = input[threadIdx.x];
     }
     __syncthreads();
 
     for (int i = thread_idx; i < it_per_input; i += num_threads) {
-        propagate_fwd(d_weights[0], sharedInput, out1, d_biases[0]);
+        propagate_fwd(d_weights[0], shared_input, out1, d_biases[0]);
         relu(out1, 98);
 
         propagate_fwd(d_weights[1], out1, out2, d_biases[1]);
@@ -154,23 +165,30 @@ __global__ void infer(float* d_inputs, int* d_results, matrix** d_weights, matri
         d_results[in_num] = argmax(out1, 52);
     }
 }
+
 int main(int argc, char* argv[]) {
-    MPI_Init(&argc, &argv);
-    int totalProcess, processId;
-    MPI_Comm_size(MPI_COMM_WORLD, &totalProcess); // size
-    MPI_Comm_rank(MPI_COMM_WORLD, &processId);    // gpuid
 
     if (argc < 4) {
         printf("Not enough arguments. Usage: speed_cpu <path_to_model.txt> <tensors_dir/> <number_of_inferences>\n");
+#ifdef USE_MPI
         MPI_Finalize();
+#endif
         return EXIT_FAILURE;
     }
 
-    // get no of gpu
-    int deviceCount;
-    cudaGetDeviceCount(&deviceCount);
-    int deviceId = processId % deviceCount;
-    cudaSetDevice(deviceId);
+#ifdef USE_MPI
+    // Initialise GPU environment
+    MPI_Init(&argc, &argv);
+    int num_proccesses, process_id;
+    MPI_Comm_size(MPI_COMM_WORLD, &num_proccesses);
+    MPI_Comm_rank(MPI_COMM_WORLD, &process_id);
+
+    int device_count;
+    cudaGetDeviceCount(&device_count);
+    int device_id = process_id % device_count;
+    cudaSetDevice(device_id);
+    printf("MPI device id: %d\n", device_id);
+#endif
 
     // Start timing
     struct timeval stop, start;
@@ -193,85 +211,93 @@ int main(int argc, char* argv[]) {
     biases[6] = new_matrix(52, 1);
     read_model(argv[1]);
 
-    CUDA_CHECK(cudaMalloc(&d_weights, NUM_LAYERS * sizeof(matrix*)));
-    CUDA_CHECK(cudaMalloc(&d_biases, NUM_LAYERS * sizeof(matrix*)));
+    // Copy model to GPU
+    cudaMalloc(&d_weights, NUM_LAYERS * sizeof(matrix*));
+    cudaMalloc(&d_biases, NUM_LAYERS * sizeof(matrix*));
     for (int i = 0; i < NUM_LAYERS; i++) {
-        matrix* a = copy_to_device(weights[i]);
-        matrix* b = copy_to_device(biases[i]);
-        CUDA_CHECK(cudaMemcpy(&(d_weights[i]), &a, sizeof(matrix*), cudaMemcpyHostToDevice));
-        CUDA_CHECK(cudaMemcpy(&(d_biases[i]), &b, sizeof(matrix*), cudaMemcpyHostToDevice));
+        matrix* layer_weight = copy_to_device(weights[i]);
+        matrix* layer_bias = copy_to_device(biases[i]);
+        cudaMemcpy(&(d_weights[i]), &layer_weight, sizeof(matrix*), cudaMemcpyHostToDevice);
+        cudaMemcpy(&(d_biases[i]), &layer_bias, sizeof(matrix*), cudaMemcpyHostToDevice);
     }
 
     const char* directory_path = argv[2];
-    struct dirent* entry;
-    DIR* dir = opendir(directory_path);
+    int input_count = file_count(directory_path);
+    int num_its = atoi(argv[3]);
 
+    results = (int*)malloc((input_count) * sizeof(int));
+    inputs = (f32*)malloc((input_count) * sizeof(f32) * TENSOR_LENGTH);
+    cudaMalloc(&d_results, (input_count) * sizeof(int));
+    cudaMalloc(&d_inputs, (input_count) * sizeof(float) * TENSOR_LENGTH);
+    
     // Read and process inputs
     char* file_name = (char*)malloc((100) * sizeof(char));
     char* file_num_str = (char*)malloc((100) * sizeof(char));
 
-    int file_num;
-    int input_count = 0;
-    while ((entry = readdir(dir)) != NULL) {
-        if (entry->d_type == DT_REG) {
-            input_count++;
-        }
-    }
-
-    results = (int*)malloc((input_count) * sizeof(int));
-    inputs = (float*)malloc((input_count) * sizeof(float) * 225);
-
-    cudaMalloc(&d_results, (input_count) * sizeof(int));
-    cudaMalloc(&d_inputs, (input_count) * sizeof(float) * 225);
-
+    struct dirent* entry;
+    DIR* dir = opendir(directory_path);
     dir = opendir(directory_path);
     while ((entry = readdir(dir)) != NULL) {
         if (entry->d_type == DT_REG) {
             strcpy(file_num_str, entry->d_name);
             file_num_str[strlen(entry->d_name) - 7] = '\0';
-            file_num = atoi(entry->d_name);
+            int file_num = atoi(entry->d_name);
             strcpy(file_name, directory_path);
             strcat(file_name, "/");
             strcat(file_name, entry->d_name);
-            read_tensor((float*)&inputs[(file_num - 1) * 225], file_name);
+            read_tensor((f32*)&inputs[(file_num - 1) * 225], file_name);
         }
     }
-
     free(file_name);
     free(file_num_str);
     closedir(dir);
 
+    // Move input array to GPU memory
     cudaMemcpy(d_inputs, inputs, sizeof(float) * 225 * input_count, cudaMemcpyHostToDevice);
 
-    int it_num = atoi(argv[3]);
-    int gpu_it_num = it_num / totalProcess + (processId < (it_num % totalProcess) ? 1 : 0);
+# ifdef USE_MPI
+    int it_per_gpu = num_its / num_proccesses + (process_id < (num_its % num_proccesses) ? 1 : 0);
+#else 
+    int it_per_gpu = num_its;
+#endif
 
-    struct timeval stop1, start1;
-    gettimeofday(&start1, NULL);
+    struct timeval stop_inf, start_inf;
+    gettimeofday(&start_inf, NULL);
 
     cudaDeviceSynchronize();
     for (int i = 0; i < input_count; i++) {
-        infer<<<BLOCKS, THREADS_PER_BLOCK>>>(d_inputs, d_results, d_weights, d_biases, gpu_it_num, i);
-        CUDA_CHECK(cudaGetLastError());
+        infer<<<BLOCKS, THREADS_PER_BLOCK>>>(d_inputs, d_results, d_weights, d_biases, it_per_gpu, i);
     }
     cudaDeviceSynchronize();
 
-    if (processId == 0) {
+#ifdef USE_MPI
+    if (process_id == 0) {
+#endif
         cudaMemcpy(results, d_results, (input_count) * (sizeof(int)), cudaMemcpyDeviceToHost);
-        gettimeofday(&stop1, NULL);
-        printf("Process %d - Inference: %lu us\n", processId,
-               (stop1.tv_sec - start1.tv_sec) * 1000000 + stop1.tv_usec - start1.tv_usec);
+        gettimeofday(&stop_inf, NULL);
+#ifdef USE_MPI
+        printf("Process %d - Inference: %lu us\n", process_id,
+               (stop_inf.tv_sec - start_inf.tv_sec) * 1000000 + stop_inf.tv_usec - start_inf.tv_usec);
+#endif
+
+        // Print output to csv
         FILE* csv_file = fopen("results.csv", "w+");
         fprintf(csv_file, "image_number, guess\n");
         for (int i = 0; i < input_count; i++) {
             fprintf(csv_file, "%d, %c\n", i + 1, letters[results[i]]);
         }
         fclose(csv_file);
+#ifdef USE_MPI
     }
+#endif
+
     // Time taken
     gettimeofday(&stop, NULL);
-    printf("Process %d - Total: %lu us\n", processId,
+    printf("Total: %lu us\n",
            (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec);
+
+#ifdef USE_MPI
     MPI_Finalize();
+#endif
     return EXIT_SUCCESS;
 }
diff --git a/cudasrc/matrix.cu b/cudasrc/matrix.cu
index c223f6c..044ee4b 100644
--- a/cudasrc/matrix.cu
+++ b/cudasrc/matrix.cu
@@ -1,5 +1,4 @@
 #include "matrix.cuh"
-#include "util.cuh"
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -22,7 +21,7 @@ __global__ void alloc(matrix* res, float* data, int rows, int cols) {
 
 matrix* new_matrix_d(int rows, int cols) {
     matrix* res;
-    CUDA_CHECK(cudaMalloc(&res, sizeof(matrix)));
+    cudaMalloc(&res, sizeof(matrix));
     float* data;
     cudaMalloc(&data, rows * cols * sizeof(float));
     alloc<<<1, 1>>>(res, data, rows, cols);
@@ -31,7 +30,7 @@ matrix* new_matrix_d(int rows, int cols) {
 
 matrix* copy_to_device(matrix* h_mat) {
     matrix* res;
-    CUDA_CHECK(cudaMalloc(&res, sizeof(matrix)));
+    cudaMalloc(&res, sizeof(matrix));
     float* data;
     cudaMalloc(&data, h_mat->rows * h_mat->cols * sizeof(float));
     cudaMemcpy(data, h_mat->data, h_mat->rows * h_mat->cols * sizeof(float), cudaMemcpyHostToDevice);
@@ -78,14 +77,6 @@ __device__ void relu(float* a, int rows) {
     }
 }
 
-// Hacky but fast and accurate for existing inputs
-static __device__ inline float fastexp(float x) {
-    int tmp = (int)(1512775 * x + 1072632447);
-    float result;
-    memcpy(&result, &tmp, sizeof(result));
-    return result;
-}
-
 __device__ void softmax(float* a, int rows) {
     float sum = 0.0;
     for (size_t i = 0; i < rows; i++) {

From cb2683b34d0e0fa44b7cf71edcd2f9f091a6b2ec Mon Sep 17 00:00:00 2001
From: Artemis Rosman <73006620+rozukke@users.noreply.github.com>
Date: Mon, 8 Jul 2024 19:58:21 +1000
Subject: [PATCH 4/8] CUDA headers general quality pass

---
 cudasrc/matrix.cuh | 2 ++
 cudasrc/util.cuh   | 8 --------
 2 files changed, 2 insertions(+), 8 deletions(-)
 delete mode 100644 cudasrc/util.cuh

diff --git a/cudasrc/matrix.cuh b/cudasrc/matrix.cuh
index b2191cb..4c583c7 100644
--- a/cudasrc/matrix.cuh
+++ b/cudasrc/matrix.cuh
@@ -1,5 +1,7 @@
 #pragma once
 
+typedef float f32;
+
 typedef struct {
     int rows;
     int cols;
diff --git a/cudasrc/util.cuh b/cudasrc/util.cuh
deleted file mode 100644
index a6f988a..0000000
--- a/cudasrc/util.cuh
+++ /dev/null
@@ -1,8 +0,0 @@
-#pragma once
-
-#define CUDA_CHECK(call)                                                                                               \
-    cudaError_t err = call;                                                                                            \
-    if (err != cudaSuccess) {                                                                                          \
-        fprintf(stderr, "CUDA error in %s (%s:%d): %s\n", __func__, __FILE__, __LINE__, cudaGetErrorString(err));      \
-        exit(EXIT_FAILURE);                                                                                            \
-    }
\ No newline at end of file

From bae32dc0eac1a30e3d4d2dfa720cca6225604707 Mon Sep 17 00:00:00 2001
From: Artemis Rosman <73006620+rozukke@users.noreply.github.com>
Date: Mon, 8 Jul 2024 19:58:35 +1000
Subject: [PATCH 5/8] Prevent warning when compiling

---
 src/file_io.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/file_io.c b/src/file_io.c
index 0b9847f..936f6a4 100644
--- a/src/file_io.c
+++ b/src/file_io.c
@@ -74,7 +74,11 @@ void read_tensor(f32* a, const char* file_name) {
     char* line = NULL;
     size_t len = 0;
 
-    getline(&line, &len, file);
+    if (getline(&line, &len, file) == -1) {
+        perror("Could not read tensor file. Exiting.");
+        exit(EXIT_FAILURE);
+    } 
+    
     char* token;
     float value;
     const char* delimiter = ",";

From 327d37ea9e988669cb0b76f60dd6840edc12a1e2 Mon Sep 17 00:00:00 2001
From: Artemis Rosman <73006620+rozukke@users.noreply.github.com>
Date: Mon, 8 Jul 2024 20:01:50 +1000
Subject: [PATCH 6/8] Type naming consistency

---
 cudasrc/main.cu    | 22 +++++++++++-----------
 cudasrc/matrix.cu  | 36 ++++++++++++++++++------------------
 cudasrc/matrix.cuh | 12 ++++++------
 src/file_io.c      |  8 ++++----
 src/main.c         |  6 +++---
 src/matrix.c       | 12 ++++++------
 6 files changed, 48 insertions(+), 48 deletions(-)

diff --git a/cudasrc/main.cu b/cudasrc/main.cu
index 516452c..2f17a96 100644
--- a/cudasrc/main.cu
+++ b/cudasrc/main.cu
@@ -31,7 +31,7 @@ char letters[52] = {'A', 'a', 'B', 'b', 'C', 'c', 'D', 'd', 'E', 'e', 'F', 'f',
 
 void process_weights_str(char* line, int layer) {
     char* token;
-    float value;
+    f32 value;
     const char* delimiter = ",";
 
     token = strtok(line, delimiter);
@@ -45,7 +45,7 @@ void process_weights_str(char* line, int layer) {
 
 void process_biases_str(char* line, int layer) {
     char* token;
-    float value;
+    f32 value;
     const char* delimiter = ",";
 
     token = strtok(line, delimiter);
@@ -80,7 +80,7 @@ void read_model(const char* file_name) {
     fclose(file);
 }
 
-void read_tensor(float* out, const char* fileName) {
+void read_tensor(f32* out, const char* fileName) {
     FILE* file = fopen(fileName, "r");
     char* line = NULL;
     size_t len = 0;
@@ -91,7 +91,7 @@ void read_tensor(float* out, const char* fileName) {
     }
 
     char* token;
-    float value;
+    f32 value;
     const char* delimiter = ",";
     token = strtok(line, delimiter);
 
@@ -123,17 +123,17 @@ __device__ void propagate_fwd(matrix* weights, f32* input_layer, f32* output_lay
     matrix_add(output_layer, biases->data, biases->rows);
 }
 
-__global__ void infer(float* d_inputs, int* d_results, matrix** d_weights, matrix** d_biases, int it_per_input,
+__global__ void infer(f32* d_inputs, int* d_results, matrix** d_weights, matrix** d_biases, int it_per_input,
                       int in_num) {
 
-    __shared__ float shared_input[TENSOR_LENGTH];
-    float out1[98];
-    float out2[65];
+    __shared__ f32 shared_input[TENSOR_LENGTH];
+    f32 out1[98];
+    f32 out2[65];
 
     int num_threads = blockDim.x * gridDim.x;
     int thread_idx = (blockIdx.x * blockDim.x + threadIdx.x);
 
-    float* input = (float*)&d_inputs[in_num * TENSOR_LENGTH];
+    f32* input = (f32*)&d_inputs[in_num * TENSOR_LENGTH];
 
     if (threadIdx.x < TENSOR_LENGTH) {
         shared_input[threadIdx.x] = input[threadIdx.x];
@@ -228,7 +228,7 @@ int main(int argc, char* argv[]) {
     results = (int*)malloc((input_count) * sizeof(int));
     inputs = (f32*)malloc((input_count) * sizeof(f32) * TENSOR_LENGTH);
     cudaMalloc(&d_results, (input_count) * sizeof(int));
-    cudaMalloc(&d_inputs, (input_count) * sizeof(float) * TENSOR_LENGTH);
+    cudaMalloc(&d_inputs, (input_count) * sizeof(f32) * TENSOR_LENGTH);
     
     // Read and process inputs
     char* file_name = (char*)malloc((100) * sizeof(char));
@@ -253,7 +253,7 @@ int main(int argc, char* argv[]) {
     closedir(dir);
 
     // Move input array to GPU memory
-    cudaMemcpy(d_inputs, inputs, sizeof(float) * 225 * input_count, cudaMemcpyHostToDevice);
+    cudaMemcpy(d_inputs, inputs, sizeof(f32) * 225 * input_count, cudaMemcpyHostToDevice);
 
 # ifdef USE_MPI
     int it_per_gpu = num_its / num_proccesses + (process_id < (num_its % num_proccesses) ? 1 : 0);
diff --git a/cudasrc/matrix.cu b/cudasrc/matrix.cu
index 044ee4b..43af503 100644
--- a/cudasrc/matrix.cu
+++ b/cudasrc/matrix.cu
@@ -9,11 +9,11 @@ __host__ __device__ matrix* new_matrix(int rows, int cols) {
     matrix* res = (matrix*)malloc(sizeof(matrix));
     res->rows = rows;
     res->cols = cols;
-    res->data = (float*)malloc((rows * cols) * sizeof(float));
+    res->data = (f32*)malloc((rows * cols) * sizeof(f32));
     return res;
 }
 
-__global__ void alloc(matrix* res, float* data, int rows, int cols) {
+__global__ void alloc(matrix* res, f32* data, int rows, int cols) {
     res->rows = rows;
     res->cols = cols;
     res->data = data;
@@ -22,8 +22,8 @@ __global__ void alloc(matrix* res, float* data, int rows, int cols) {
 matrix* new_matrix_d(int rows, int cols) {
     matrix* res;
     cudaMalloc(&res, sizeof(matrix));
-    float* data;
-    cudaMalloc(&data, rows * cols * sizeof(float));
+    f32* data;
+    cudaMalloc(&data, rows * cols * sizeof(f32));
     alloc<<<1, 1>>>(res, data, rows, cols);
     return res;
 }
@@ -31,9 +31,9 @@ matrix* new_matrix_d(int rows, int cols) {
 matrix* copy_to_device(matrix* h_mat) {
     matrix* res;
     cudaMalloc(&res, sizeof(matrix));
-    float* data;
-    cudaMalloc(&data, h_mat->rows * h_mat->cols * sizeof(float));
-    cudaMemcpy(data, h_mat->data, h_mat->rows * h_mat->cols * sizeof(float), cudaMemcpyHostToDevice);
+    f32* data;
+    cudaMalloc(&data, h_mat->rows * h_mat->cols * sizeof(f32));
+    cudaMemcpy(data, h_mat->data, h_mat->rows * h_mat->cols * sizeof(f32), cudaMemcpyHostToDevice);
     alloc<<<1, 1>>>(res, data, h_mat->rows, h_mat->cols);
     return res;
 }
@@ -42,14 +42,14 @@ __device__ __host__ matrix* create_copy(matrix* mat) {
     matrix* res = (matrix*)malloc(sizeof(matrix));
     res->rows = mat->rows;
     res->cols = mat->cols;
-    res->data = (float*)malloc((res->rows * res->cols) * sizeof(float));
-    memcpy(res->data, mat->data, res->rows * res->cols * sizeof(float));
+    res->data = (f32*)malloc((res->rows * res->cols) * sizeof(f32));
+    memcpy(res->data, mat->data, res->rows * res->cols * sizeof(f32));
     return res;
 }
 
-__device__ void matrix_mul(float* weight, float* input, float* result, int w_rows, int w_cols) {
+__device__ void matrix_mul(f32* weight, f32* input, f32* result, int w_rows, int w_cols) {
     for (int i = 0; i < w_rows; i++) {
-        float sum = 0;
+        f32 sum = 0;
         int j = 0;
 
         for (; j <= w_cols - 4; j += 4) {
@@ -65,31 +65,31 @@ __device__ void matrix_mul(float* weight, float* input, float* result, int w_row
     }
 }
 
-__device__ void matrix_add(float* a, float* b, int rows) {
+__device__ void matrix_add(f32* a, f32* b, int rows) {
     for (int i = 0; i < rows; i++) {
         a[i] += b[i];
     }
 }
 
-__device__ void relu(float* a, int rows) {
+__device__ void relu(f32* a, int rows) {
     for (int i = 0; i < rows; i++) {
         a[i] = (a[i] > 0) ? a[i] : 0;
     }
 }
 
-__device__ void softmax(float* a, int rows) {
-    float sum = 0.0;
+__device__ void softmax(f32* a, int rows) {
+    f32 sum = 0.0;
     for (size_t i = 0; i < rows; i++) {
         sum += __expf(a[i]);
     }
-    float t = __logf(sum);
+    f32 t = __logf(sum);
     for (size_t i = 0; i < rows; i++) {
         a[i] = __expf(a[i] - t);
     }
 }
 
-__device__ int argmax(float* a, int rows) {
-    float res = a[0];
+__device__ int argmax(f32* a, int rows) {
+    f32 res = a[0];
     int idx = 0;
     for (int i = 0; i < rows; i++) {
         if (res < a[i]) {
diff --git a/cudasrc/matrix.cuh b/cudasrc/matrix.cuh
index 4c583c7..2c2ae7d 100644
--- a/cudasrc/matrix.cuh
+++ b/cudasrc/matrix.cuh
@@ -5,7 +5,7 @@ typedef float f32;
 typedef struct {
     int rows;
     int cols;
-    float* data; // array
+    f32* data; // array
 } matrix;
 
  __host__ __device__ matrix* new_matrix(int rows, int cols);
@@ -14,14 +14,14 @@ matrix* copy_to_device(matrix* h_mat);
 
 matrix* new_matrix_d(int rows, int cols);
 
-__device__ void matrix_mul(float* a, float* b, float* c, int rows, int cols);
+__device__ void matrix_mul(f32* a, f32* b, f32* c, int rows, int cols);
 
-__device__ void matrix_add(float* a, float* b, int rows);
+__device__ void matrix_add(f32* a, f32* b, int rows);
 
-__device__ void relu(float* a, int rows);
+__device__ void relu(f32* a, int rows);
 
-__device__ void softmax(float* a, int rows);
+__device__ void softmax(f32* a, int rows);
 
-__device__ int argmax(float* a, int rows);
+__device__ int argmax(f32* a, int rows);
 
 __device__ __host__ matrix* create_copy(matrix* mat);
diff --git a/src/file_io.c b/src/file_io.c
index 936f6a4..79b77ed 100644
--- a/src/file_io.c
+++ b/src/file_io.c
@@ -20,7 +20,7 @@ int file_count(const char* dir_path) {
 
 void process_weights_str(matrix** weights, char* line, int layer) {
     char* token;
-    float value;
+    f32 value;
     const char* delimiter = ",";
 
     token = strtok(line, delimiter);
@@ -34,7 +34,7 @@ void process_weights_str(matrix** weights, char* line, int layer) {
 
 void process_biases_str(vector** biases, char* line, int layer) {
     char* token;
-    float value;
+    f32 value;
     const char* delimiter = ",";
 
     token = strtok(line, delimiter);
@@ -78,9 +78,9 @@ void read_tensor(f32* a, const char* file_name) {
         perror("Could not read tensor file. Exiting.");
         exit(EXIT_FAILURE);
     } 
-    
+
     char* token;
-    float value;
+    f32 value;
     const char* delimiter = ",";
     token = strtok(line, delimiter);
 
diff --git a/src/main.c b/src/main.c
index 909586b..6f5184b 100644
--- a/src/main.c
+++ b/src/main.c
@@ -25,7 +25,7 @@ char letters[52] = {'A', 'a', 'B', 'b', 'C', 'c', 'D', 'd', 'E', 'e', 'F', 'f',
                     'J', 'j', 'K', 'k', 'L', 'l', 'M', 'm', 'N', 'n', 'O', 'o', 'P', 'p', 'Q', 'q', 'R', 'r',
                     'S', 's', 'T', 't', 'U', 'u', 'V', 'v', 'W', 'w', 'X', 'x', 'Y', 'y', 'Z', 'z'};
 
-void propagate_fwd(const matrix* weights, const float* inputs, float* results, const vector* biases) {
+void propagate_fwd(const matrix* weights, const f32* inputs, f32* results, const vector* biases) {
     sgemv_t_tuned(weights->data, inputs, results, weights->cols, weights->rows);
     // Add biases onto results
     vector_add_inplace(biases->len, biases->data, results);
@@ -35,8 +35,8 @@ void propagate_fwd(const matrix* weights, const float* inputs, float* results, c
 // This code f***ing sucks but its fast so uhhhh
 u8 infer_reuse_layers_thread(vector* input, matrix** weights, vector** biases) {
     // Slightly larger than required for padding
-    float out0[104] __attribute__((aligned(SIMD_ALIGN))) = {0};
-    float out1[72] __attribute__((aligned(SIMD_ALIGN))) = {0};
+    f32 out0[104] __attribute__((aligned(SIMD_ALIGN))) = {0};
+    f32 out1[72] __attribute__((aligned(SIMD_ALIGN))) = {0};
 
     propagate_fwd(weights[0], input->data, out0, biases[0]);
     relu_inplace(out0, 98);
diff --git a/src/matrix.c b/src/matrix.c
index 39f818e..3d06924 100644
--- a/src/matrix.c
+++ b/src/matrix.c
@@ -39,7 +39,7 @@ vector* new_vec_aligned(int len) {
 }
 
 // ver. Artemis Rosman simd_intrin 2x8
-static void kernel(const float* in, const float* wg, float* rs, int start_row, int start_col, int w_width) {
+static void kernel(const f32* in, const f32* wg, f32* rs, int start_row, int start_col, int w_width) {
     // printf("Kernel at row %d col %d\n", start_row, start_col);
     __m256 res = _mm256_load_ps(&rs[start_col]);
 
@@ -54,7 +54,7 @@ static void kernel(const float* in, const float* wg, float* rs, int start_row, i
 // Ver. Artemis Rosman
 // W rows and W width is expected to be for the column major matrix, i.e. len of
 // in vec = w_rows, len of out vec = w_cols
-void sgemv_t_tuned(const float* weights, const float* inputs, float* __restrict__ results, int w_width, int w_rows) {
+void sgemv_t_tuned(const f32* weights, const f32* inputs, f32* __restrict__ results, int w_width, int w_rows) {
     // Perform mult using kernel
     for (int row = 0; row < w_rows; row += KERN_ROWS) {
         for (int col = 0; col < w_width; col += KERN_COLS) {
@@ -77,15 +77,15 @@ void relu_inplace(f32* dest, int len) {
 }
 
 // Hacky but fast and accurate for existing inputs
-static inline float fastexp(float x) {
+static inline f32 fastexp(f32 x) {
     int tmp = (int)(1512775 * x + 1072632447);
-    float result;
+    f32 result;
     memcpy(&result, &tmp, sizeof(result));
     return result;
 }
 
 void softmax_inplace(f32* dest, int len) {
-    float res = 0.0f;
+    f32 res = 0.0f;
     for (int i = 0; i < len; i++) {
         res += fastexp(dest[i]);
     }
@@ -97,7 +97,7 @@ void softmax_inplace(f32* dest, int len) {
 // Get result from output layer
 u8 argmax(f32* in, int len) {
     int idx = 0;
-    float res = in[0];
+    f32 res = in[0];
     for (int i = 0; i < len; i++) {
         if (res < in[i]) {
             res = in[i];

From 12727bec7b64f47a3cbdfc2fa38574f86a7a617c Mon Sep 17 00:00:00 2001
From: Artemis Rosman <73006620+rozukke@users.noreply.github.com>
Date: Mon, 8 Jul 2024 20:08:56 +1000
Subject: [PATCH 7/8] Fix formatting

---
 cudasrc/main.cu    | 9 ++++-----
 cudasrc/matrix.cuh | 2 +-
 src/file_io.c      | 2 +-
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/cudasrc/main.cu b/cudasrc/main.cu
index 2f17a96..549f39a 100644
--- a/cudasrc/main.cu
+++ b/cudasrc/main.cu
@@ -229,7 +229,7 @@ int main(int argc, char* argv[]) {
     inputs = (f32*)malloc((input_count) * sizeof(f32) * TENSOR_LENGTH);
     cudaMalloc(&d_results, (input_count) * sizeof(int));
     cudaMalloc(&d_inputs, (input_count) * sizeof(f32) * TENSOR_LENGTH);
-    
+
     // Read and process inputs
     char* file_name = (char*)malloc((100) * sizeof(char));
     char* file_num_str = (char*)malloc((100) * sizeof(char));
@@ -255,9 +255,9 @@ int main(int argc, char* argv[]) {
     // Move input array to GPU memory
     cudaMemcpy(d_inputs, inputs, sizeof(f32) * 225 * input_count, cudaMemcpyHostToDevice);
 
-# ifdef USE_MPI
+#ifdef USE_MPI
     int it_per_gpu = num_its / num_proccesses + (process_id < (num_its % num_proccesses) ? 1 : 0);
-#else 
+#else
     int it_per_gpu = num_its;
 #endif
 
@@ -293,8 +293,7 @@ int main(int argc, char* argv[]) {
 
     // Time taken
     gettimeofday(&stop, NULL);
-    printf("Total: %lu us\n",
-           (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec);
+    printf("Total: %lu us\n", (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec);
 
 #ifdef USE_MPI
     MPI_Finalize();
diff --git a/cudasrc/matrix.cuh b/cudasrc/matrix.cuh
index 2c2ae7d..e7886f9 100644
--- a/cudasrc/matrix.cuh
+++ b/cudasrc/matrix.cuh
@@ -8,7 +8,7 @@ typedef struct {
     f32* data; // array
 } matrix;
 
- __host__ __device__ matrix* new_matrix(int rows, int cols);
+__host__ __device__ matrix* new_matrix(int rows, int cols);
 
 matrix* copy_to_device(matrix* h_mat);
 
diff --git a/src/file_io.c b/src/file_io.c
index 79b77ed..5e422c0 100644
--- a/src/file_io.c
+++ b/src/file_io.c
@@ -77,7 +77,7 @@ void read_tensor(f32* a, const char* file_name) {
     if (getline(&line, &len, file) == -1) {
         perror("Could not read tensor file. Exiting.");
         exit(EXIT_FAILURE);
-    } 
+    }
 
     char* token;
     f32 value;

From e716eb59774e2cf764ff55fb6db4bf866c852bcd Mon Sep 17 00:00:00 2001
From: Artemis Rosman <73006620+rozukke@users.noreply.github.com>
Date: Mon, 8 Jul 2024 20:11:35 +1000
Subject: [PATCH 8/8] Fix CI failing (I think)

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0ead262..04734f4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.16)
 
-project(ichida-algo LANGUAGES C CUDA)
+project(ichida-algo LANGUAGES C)
 
 set(CMAKE_C_FLAGS "-O3 -march=native -ffast-math -funroll-loops -fopenmp -Wall -Wextra -Wpedantic")
 set(CMAKE_C_STANDARD 11)