From 27e3fe702769f350d62fe5669399e8901d949da6 Mon Sep 17 00:00:00 2001 From: Artemis Rosman <73006620+rozukke@users.noreply.github.com> Date: Mon, 8 Jul 2024 19:57:15 +1000 Subject: [PATCH 1/8] Update CMake for conditional compilation --- CMakeLists.txt | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index faea7b0..0ead262 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,34 +1,42 @@ cmake_minimum_required(VERSION 3.16) -project(ichida-algo LANGUAGES C CXX) +project(ichida-algo LANGUAGES C CUDA) -set(CMAKE_C_FLAGS "-O3 -march=native -ffast-math -funroll-loops -fopenmp -Wall -Wextra") +set(CMAKE_C_FLAGS "-O3 -march=native -ffast-math -funroll-loops -fopenmp -Wall -Wextra -Wpedantic") set(CMAKE_C_STANDARD 11) set(CMAKE_C_STANDARD_REQUIRED True) -set(CMAKE_VERBOSE_MAKEFILE ON) -set(INC_DIR include) -set(SRC_DIR src) +set(SOURCE_DIR src) set(CUDA_SRC_DIR cudasrc) -include_directories(${INC_DIR}) - -file(GLOB_RECURSE SOURCE_FILES ${SRC_DIR}/*.c) +file(GLOB SOURCE_FILES ${SOURCE_DIR}/*.c) add_executable(speed_cpu ${SOURCE_FILES}) target_link_libraries(speed_cpu m pthread gomp) find_package(CUDA) - if(CUDA_FOUND) enable_language(CUDA) + set(CMAKE_CUDA_ARCHITECTURES "80") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xptxas -O3 --use_fast_math -Xcompiler -march=native -unroll-aggressive -arch=sm_80") - find_package(MPI REQUIRED) - include_directories(${MPI_INCLUDE_PATH}) file(GLOB_RECURSE CUDA_SOURCE_FILES ${CUDA_SRC_DIR}/*.cu) add_executable(speed_gpu ${CUDA_SOURCE_FILES}) set_target_properties(speed_gpu PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - target_link_libraries(speed_gpu m ${MPI_LIBRARIES}) + + if(COMPILE_MPI) + find_package(MPI) + if(MPI_FOUND) + include_directories(${MPI_INCLUDE_PATH}) + target_link_libraries(speed_gpu m ${MPI_LIBRARIES}) + set_property(TARGET speed_gpu PROPERTY COMPILE_DEFINITIONS USE_MPI) + else() + message(STATUS "MPI not found. Please install library to compile with MPI enabled.") + endif(MPI_FOUND) + + else() + target_link_libraries(speed_gpu m) + endif() + else() message(STATUS "CUDA not found, only CPU version will be built.") endif() From 7ea1a82825354103f10d4bc671f5345955418262 Mon Sep 17 00:00:00 2001 From: Artemis Rosman <73006620+rozukke@users.noreply.github.com> Date: Mon, 8 Jul 2024 19:57:31 +1000 Subject: [PATCH 2/8] Update Makefile for conditional compilation --- Makefile | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index 0253aee..b82c40b 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ -.PHONY: all clean build run_cpu run_gpu test_cpu test_gpu bench stat +.PHONY: all clean build build_mpi build_gpu test_cpu test_gpu bench stat # Default iterations -iterations ?= 1000 +iterations ?= 100000 all: build @@ -11,26 +11,34 @@ clean: rm -rf build rm -f speed_cpu speed_gpu -build: clean +build_gpu: clean cmake -S . -B build -DCMAKE_BUILD_TYPE=Release $(MAKE) -C build cp -u build/speed_cpu ./ if [ -f build/speed_gpu ]; then cp -u build/speed_gpu ./; fi -run_cpu: build +build_mpi: clean + cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DCOMPILE_MPI=True + $(MAKE) -C build + cp -u build/speed_cpu ./ + if [ -f build/speed_gpu ]; then cp -u build/speed_gpu ./; fi + +build: build_mpi + +run_cpu: ./speed_cpu ./weights_and_biases.txt ./tensors $(iterations) -run_gpu: build +run_gpu: + ./speed_gpu ./weights_and_biases.txt ./tensors $(iterations) + +run_mpi: n_gpus=$(shell nvidia-smi --query-gpu=name --format=csv,noheader | wc -l); \ mpirun -np $$n_gpus ./speed_gpu ./weights_and_biases.txt ./tensors $(iterations) -test_cpu: build - ./speed_cpu ./weights_and_biases.txt ./tensors $(iterations) +test_cpu: build run_cpu mv ./results.csv ./test python3 ./test/verify_csv.py -test_gpu: build - n_gpus=$(shell nvidia-smi --query-gpu=name --format=csv,noheader | wc -l); \ - mpirun -np $$n_gpus ./speed_gpu ./weights_and_biases.txt ./tensors $(iterations) +test_gpu: build run_mpi mv ./results.csv ./test python3 ./test/verify_csv.py \ No newline at end of file From 5e4f509b7e55fbd233e36a58cfe38ee97a4a45f7 Mon Sep 17 00:00:00 2001 From: Artemis Rosman <73006620+rozukke@users.noreply.github.com> Date: Mon, 8 Jul 2024 19:57:58 +1000 Subject: [PATCH 3/8] CUDA general quality pass --- cudasrc/main.cu | 168 ++++++++++++++++++++++++++-------------------- cudasrc/matrix.cu | 13 +--- 2 files changed, 99 insertions(+), 82 deletions(-) diff --git a/cudasrc/main.cu b/cudasrc/main.cu index 732c948..516452c 100644 --- a/cudasrc/main.cu +++ b/cudasrc/main.cu @@ -1,32 +1,28 @@ #include "matrix.cuh" #include -#include #include #include #include #include +#ifdef USE_MPI +#include +#endif #define NUM_LAYERS 7 +#define TENSOR_LENGTH 225 -#define CUDA_CHECK(call) \ - do { \ - cudaError_t err = call; \ - if (err != cudaSuccess) { \ - fprintf(stderr, "CUDA error in %s (%s:%d): %s\n", __func__, __FILE__, __LINE__, cudaGetErrorString(err)); \ - exit(EXIT_FAILURE); \ - } \ - } while (0) +#define BLOCKS 108 +#define THREADS_PER_BLOCK 1024 matrix* weights[NUM_LAYERS]; matrix* biases[NUM_LAYERS]; +f32* inputs; +int* results; -// device weights and biases; +// Device memory matrix** d_weights; matrix** d_biases; - -float* inputs; -float* d_inputs; -int* results; +f32* d_inputs; int* d_results; char letters[52] = {'A', 'a', 'B', 'b', 'C', 'c', 'D', 'd', 'E', 'e', 'F', 'f', 'G', 'g', 'H', 'h', 'I', 'i', @@ -84,12 +80,16 @@ void read_model(const char* file_name) { fclose(file); } -void read_tensor(float* a, const char* fileName) { +void read_tensor(float* out, const char* fileName) { FILE* file = fopen(fileName, "r"); char* line = NULL; size_t len = 0; - getline(&line, &len, file); + if (getline(&line, &len, file) == -1) { + perror("Could not read tensor file.\n"); + exit(EXIT_FAILURE); + } + char* token; float value; const char* delimiter = ","; @@ -97,40 +97,51 @@ void read_tensor(float* a, const char* fileName) { for (int i = 0; i < 225; i++) { value = strtof(token, NULL); - a[i] = value; + out[i] = value; token = strtok(NULL, delimiter); } free(line); fclose(file); } -__device__ void propagate_fwd(matrix* weights, float* input_layer, float* output_layer, matrix* biases) { +int file_count(const char* dir_path) { + struct dirent* entry; + DIR* dir = opendir(dir_path); + + // Count inputs + int num_inputs = 0; + while ((entry = readdir(dir)) != NULL) { + if (entry->d_type == DT_REG) + num_inputs++; + } + + return num_inputs; +} + +__device__ void propagate_fwd(matrix* weights, f32* input_layer, f32* output_layer, matrix* biases) { matrix_mul(weights->data, input_layer, output_layer, weights->rows, weights->cols); matrix_add(output_layer, biases->data, biases->rows); } -#define BLOCKS 108 -#define THREADS_PER_BLOCK 1024 - __global__ void infer(float* d_inputs, int* d_results, matrix** d_weights, matrix** d_biases, int it_per_input, int in_num) { - __shared__ float sharedInput[225]; + __shared__ float shared_input[TENSOR_LENGTH]; float out1[98]; float out2[65]; int num_threads = blockDim.x * gridDim.x; int thread_idx = (blockIdx.x * blockDim.x + threadIdx.x); - float* input = (float*)&d_inputs[in_num * 225]; + float* input = (float*)&d_inputs[in_num * TENSOR_LENGTH]; - if (threadIdx.x < 225) { - sharedInput[threadIdx.x] = input[threadIdx.x]; + if (threadIdx.x < TENSOR_LENGTH) { + shared_input[threadIdx.x] = input[threadIdx.x]; } __syncthreads(); for (int i = thread_idx; i < it_per_input; i += num_threads) { - propagate_fwd(d_weights[0], sharedInput, out1, d_biases[0]); + propagate_fwd(d_weights[0], shared_input, out1, d_biases[0]); relu(out1, 98); propagate_fwd(d_weights[1], out1, out2, d_biases[1]); @@ -154,23 +165,30 @@ __global__ void infer(float* d_inputs, int* d_results, matrix** d_weights, matri d_results[in_num] = argmax(out1, 52); } } + int main(int argc, char* argv[]) { - MPI_Init(&argc, &argv); - int totalProcess, processId; - MPI_Comm_size(MPI_COMM_WORLD, &totalProcess); // size - MPI_Comm_rank(MPI_COMM_WORLD, &processId); // gpuid if (argc < 4) { printf("Not enough arguments. Usage: speed_cpu \n"); +#ifdef USE_MPI MPI_Finalize(); +#endif return EXIT_FAILURE; } - // get no of gpu - int deviceCount; - cudaGetDeviceCount(&deviceCount); - int deviceId = processId % deviceCount; - cudaSetDevice(deviceId); +#ifdef USE_MPI + // Initialise GPU environment + MPI_Init(&argc, &argv); + int num_proccesses, process_id; + MPI_Comm_size(MPI_COMM_WORLD, &num_proccesses); + MPI_Comm_rank(MPI_COMM_WORLD, &process_id); + + int device_count; + cudaGetDeviceCount(&device_count); + int device_id = process_id % device_count; + cudaSetDevice(device_id); + printf("MPI device id: %d\n", device_id); +#endif // Start timing struct timeval stop, start; @@ -193,85 +211,93 @@ int main(int argc, char* argv[]) { biases[6] = new_matrix(52, 1); read_model(argv[1]); - CUDA_CHECK(cudaMalloc(&d_weights, NUM_LAYERS * sizeof(matrix*))); - CUDA_CHECK(cudaMalloc(&d_biases, NUM_LAYERS * sizeof(matrix*))); + // Copy model to GPU + cudaMalloc(&d_weights, NUM_LAYERS * sizeof(matrix*)); + cudaMalloc(&d_biases, NUM_LAYERS * sizeof(matrix*)); for (int i = 0; i < NUM_LAYERS; i++) { - matrix* a = copy_to_device(weights[i]); - matrix* b = copy_to_device(biases[i]); - CUDA_CHECK(cudaMemcpy(&(d_weights[i]), &a, sizeof(matrix*), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(&(d_biases[i]), &b, sizeof(matrix*), cudaMemcpyHostToDevice)); + matrix* layer_weight = copy_to_device(weights[i]); + matrix* layer_bias = copy_to_device(biases[i]); + cudaMemcpy(&(d_weights[i]), &layer_weight, sizeof(matrix*), cudaMemcpyHostToDevice); + cudaMemcpy(&(d_biases[i]), &layer_bias, sizeof(matrix*), cudaMemcpyHostToDevice); } const char* directory_path = argv[2]; - struct dirent* entry; - DIR* dir = opendir(directory_path); + int input_count = file_count(directory_path); + int num_its = atoi(argv[3]); + results = (int*)malloc((input_count) * sizeof(int)); + inputs = (f32*)malloc((input_count) * sizeof(f32) * TENSOR_LENGTH); + cudaMalloc(&d_results, (input_count) * sizeof(int)); + cudaMalloc(&d_inputs, (input_count) * sizeof(float) * TENSOR_LENGTH); + // Read and process inputs char* file_name = (char*)malloc((100) * sizeof(char)); char* file_num_str = (char*)malloc((100) * sizeof(char)); - int file_num; - int input_count = 0; - while ((entry = readdir(dir)) != NULL) { - if (entry->d_type == DT_REG) { - input_count++; - } - } - - results = (int*)malloc((input_count) * sizeof(int)); - inputs = (float*)malloc((input_count) * sizeof(float) * 225); - - cudaMalloc(&d_results, (input_count) * sizeof(int)); - cudaMalloc(&d_inputs, (input_count) * sizeof(float) * 225); - + struct dirent* entry; + DIR* dir = opendir(directory_path); dir = opendir(directory_path); while ((entry = readdir(dir)) != NULL) { if (entry->d_type == DT_REG) { strcpy(file_num_str, entry->d_name); file_num_str[strlen(entry->d_name) - 7] = '\0'; - file_num = atoi(entry->d_name); + int file_num = atoi(entry->d_name); strcpy(file_name, directory_path); strcat(file_name, "/"); strcat(file_name, entry->d_name); - read_tensor((float*)&inputs[(file_num - 1) * 225], file_name); + read_tensor((f32*)&inputs[(file_num - 1) * 225], file_name); } } - free(file_name); free(file_num_str); closedir(dir); + // Move input array to GPU memory cudaMemcpy(d_inputs, inputs, sizeof(float) * 225 * input_count, cudaMemcpyHostToDevice); - int it_num = atoi(argv[3]); - int gpu_it_num = it_num / totalProcess + (processId < (it_num % totalProcess) ? 1 : 0); +# ifdef USE_MPI + int it_per_gpu = num_its / num_proccesses + (process_id < (num_its % num_proccesses) ? 1 : 0); +#else + int it_per_gpu = num_its; +#endif - struct timeval stop1, start1; - gettimeofday(&start1, NULL); + struct timeval stop_inf, start_inf; + gettimeofday(&start_inf, NULL); cudaDeviceSynchronize(); for (int i = 0; i < input_count; i++) { - infer<<>>(d_inputs, d_results, d_weights, d_biases, gpu_it_num, i); - CUDA_CHECK(cudaGetLastError()); + infer<<>>(d_inputs, d_results, d_weights, d_biases, it_per_gpu, i); } cudaDeviceSynchronize(); - if (processId == 0) { +#ifdef USE_MPI + if (process_id == 0) { +#endif cudaMemcpy(results, d_results, (input_count) * (sizeof(int)), cudaMemcpyDeviceToHost); - gettimeofday(&stop1, NULL); - printf("Process %d - Inference: %lu us\n", processId, - (stop1.tv_sec - start1.tv_sec) * 1000000 + stop1.tv_usec - start1.tv_usec); + gettimeofday(&stop_inf, NULL); +#ifdef USE_MPI + printf("Process %d - Inference: %lu us\n", process_id, + (stop_inf.tv_sec - start_inf.tv_sec) * 1000000 + stop_inf.tv_usec - start_inf.tv_usec); +#endif + + // Print output to csv FILE* csv_file = fopen("results.csv", "w+"); fprintf(csv_file, "image_number, guess\n"); for (int i = 0; i < input_count; i++) { fprintf(csv_file, "%d, %c\n", i + 1, letters[results[i]]); } fclose(csv_file); +#ifdef USE_MPI } +#endif + // Time taken gettimeofday(&stop, NULL); - printf("Process %d - Total: %lu us\n", processId, + printf("Total: %lu us\n", (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec); + +#ifdef USE_MPI MPI_Finalize(); +#endif return EXIT_SUCCESS; } diff --git a/cudasrc/matrix.cu b/cudasrc/matrix.cu index c223f6c..044ee4b 100644 --- a/cudasrc/matrix.cu +++ b/cudasrc/matrix.cu @@ -1,5 +1,4 @@ #include "matrix.cuh" -#include "util.cuh" #include #include #include @@ -22,7 +21,7 @@ __global__ void alloc(matrix* res, float* data, int rows, int cols) { matrix* new_matrix_d(int rows, int cols) { matrix* res; - CUDA_CHECK(cudaMalloc(&res, sizeof(matrix))); + cudaMalloc(&res, sizeof(matrix)); float* data; cudaMalloc(&data, rows * cols * sizeof(float)); alloc<<<1, 1>>>(res, data, rows, cols); @@ -31,7 +30,7 @@ matrix* new_matrix_d(int rows, int cols) { matrix* copy_to_device(matrix* h_mat) { matrix* res; - CUDA_CHECK(cudaMalloc(&res, sizeof(matrix))); + cudaMalloc(&res, sizeof(matrix)); float* data; cudaMalloc(&data, h_mat->rows * h_mat->cols * sizeof(float)); cudaMemcpy(data, h_mat->data, h_mat->rows * h_mat->cols * sizeof(float), cudaMemcpyHostToDevice); @@ -78,14 +77,6 @@ __device__ void relu(float* a, int rows) { } } -// Hacky but fast and accurate for existing inputs -static __device__ inline float fastexp(float x) { - int tmp = (int)(1512775 * x + 1072632447); - float result; - memcpy(&result, &tmp, sizeof(result)); - return result; -} - __device__ void softmax(float* a, int rows) { float sum = 0.0; for (size_t i = 0; i < rows; i++) { From cb2683b34d0e0fa44b7cf71edcd2f9f091a6b2ec Mon Sep 17 00:00:00 2001 From: Artemis Rosman <73006620+rozukke@users.noreply.github.com> Date: Mon, 8 Jul 2024 19:58:21 +1000 Subject: [PATCH 4/8] CUDA headers general quality pass --- cudasrc/matrix.cuh | 2 ++ cudasrc/util.cuh | 8 -------- 2 files changed, 2 insertions(+), 8 deletions(-) delete mode 100644 cudasrc/util.cuh diff --git a/cudasrc/matrix.cuh b/cudasrc/matrix.cuh index b2191cb..4c583c7 100644 --- a/cudasrc/matrix.cuh +++ b/cudasrc/matrix.cuh @@ -1,5 +1,7 @@ #pragma once +typedef float f32; + typedef struct { int rows; int cols; diff --git a/cudasrc/util.cuh b/cudasrc/util.cuh deleted file mode 100644 index a6f988a..0000000 --- a/cudasrc/util.cuh +++ /dev/null @@ -1,8 +0,0 @@ -#pragma once - -#define CUDA_CHECK(call) \ - cudaError_t err = call; \ - if (err != cudaSuccess) { \ - fprintf(stderr, "CUDA error in %s (%s:%d): %s\n", __func__, __FILE__, __LINE__, cudaGetErrorString(err)); \ - exit(EXIT_FAILURE); \ - } \ No newline at end of file From bae32dc0eac1a30e3d4d2dfa720cca6225604707 Mon Sep 17 00:00:00 2001 From: Artemis Rosman <73006620+rozukke@users.noreply.github.com> Date: Mon, 8 Jul 2024 19:58:35 +1000 Subject: [PATCH 5/8] Prevent warning when compiling --- src/file_io.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/file_io.c b/src/file_io.c index 0b9847f..936f6a4 100644 --- a/src/file_io.c +++ b/src/file_io.c @@ -74,7 +74,11 @@ void read_tensor(f32* a, const char* file_name) { char* line = NULL; size_t len = 0; - getline(&line, &len, file); + if (getline(&line, &len, file) == -1) { + perror("Could not read tensor file. Exiting."); + exit(EXIT_FAILURE); + } + char* token; float value; const char* delimiter = ","; From 327d37ea9e988669cb0b76f60dd6840edc12a1e2 Mon Sep 17 00:00:00 2001 From: Artemis Rosman <73006620+rozukke@users.noreply.github.com> Date: Mon, 8 Jul 2024 20:01:50 +1000 Subject: [PATCH 6/8] Type naming consistency --- cudasrc/main.cu | 22 +++++++++++----------- cudasrc/matrix.cu | 36 ++++++++++++++++++------------------ cudasrc/matrix.cuh | 12 ++++++------ src/file_io.c | 8 ++++---- src/main.c | 6 +++--- src/matrix.c | 12 ++++++------ 6 files changed, 48 insertions(+), 48 deletions(-) diff --git a/cudasrc/main.cu b/cudasrc/main.cu index 516452c..2f17a96 100644 --- a/cudasrc/main.cu +++ b/cudasrc/main.cu @@ -31,7 +31,7 @@ char letters[52] = {'A', 'a', 'B', 'b', 'C', 'c', 'D', 'd', 'E', 'e', 'F', 'f', void process_weights_str(char* line, int layer) { char* token; - float value; + f32 value; const char* delimiter = ","; token = strtok(line, delimiter); @@ -45,7 +45,7 @@ void process_weights_str(char* line, int layer) { void process_biases_str(char* line, int layer) { char* token; - float value; + f32 value; const char* delimiter = ","; token = strtok(line, delimiter); @@ -80,7 +80,7 @@ void read_model(const char* file_name) { fclose(file); } -void read_tensor(float* out, const char* fileName) { +void read_tensor(f32* out, const char* fileName) { FILE* file = fopen(fileName, "r"); char* line = NULL; size_t len = 0; @@ -91,7 +91,7 @@ void read_tensor(float* out, const char* fileName) { } char* token; - float value; + f32 value; const char* delimiter = ","; token = strtok(line, delimiter); @@ -123,17 +123,17 @@ __device__ void propagate_fwd(matrix* weights, f32* input_layer, f32* output_lay matrix_add(output_layer, biases->data, biases->rows); } -__global__ void infer(float* d_inputs, int* d_results, matrix** d_weights, matrix** d_biases, int it_per_input, +__global__ void infer(f32* d_inputs, int* d_results, matrix** d_weights, matrix** d_biases, int it_per_input, int in_num) { - __shared__ float shared_input[TENSOR_LENGTH]; - float out1[98]; - float out2[65]; + __shared__ f32 shared_input[TENSOR_LENGTH]; + f32 out1[98]; + f32 out2[65]; int num_threads = blockDim.x * gridDim.x; int thread_idx = (blockIdx.x * blockDim.x + threadIdx.x); - float* input = (float*)&d_inputs[in_num * TENSOR_LENGTH]; + f32* input = (f32*)&d_inputs[in_num * TENSOR_LENGTH]; if (threadIdx.x < TENSOR_LENGTH) { shared_input[threadIdx.x] = input[threadIdx.x]; @@ -228,7 +228,7 @@ int main(int argc, char* argv[]) { results = (int*)malloc((input_count) * sizeof(int)); inputs = (f32*)malloc((input_count) * sizeof(f32) * TENSOR_LENGTH); cudaMalloc(&d_results, (input_count) * sizeof(int)); - cudaMalloc(&d_inputs, (input_count) * sizeof(float) * TENSOR_LENGTH); + cudaMalloc(&d_inputs, (input_count) * sizeof(f32) * TENSOR_LENGTH); // Read and process inputs char* file_name = (char*)malloc((100) * sizeof(char)); @@ -253,7 +253,7 @@ int main(int argc, char* argv[]) { closedir(dir); // Move input array to GPU memory - cudaMemcpy(d_inputs, inputs, sizeof(float) * 225 * input_count, cudaMemcpyHostToDevice); + cudaMemcpy(d_inputs, inputs, sizeof(f32) * 225 * input_count, cudaMemcpyHostToDevice); # ifdef USE_MPI int it_per_gpu = num_its / num_proccesses + (process_id < (num_its % num_proccesses) ? 1 : 0); diff --git a/cudasrc/matrix.cu b/cudasrc/matrix.cu index 044ee4b..43af503 100644 --- a/cudasrc/matrix.cu +++ b/cudasrc/matrix.cu @@ -9,11 +9,11 @@ __host__ __device__ matrix* new_matrix(int rows, int cols) { matrix* res = (matrix*)malloc(sizeof(matrix)); res->rows = rows; res->cols = cols; - res->data = (float*)malloc((rows * cols) * sizeof(float)); + res->data = (f32*)malloc((rows * cols) * sizeof(f32)); return res; } -__global__ void alloc(matrix* res, float* data, int rows, int cols) { +__global__ void alloc(matrix* res, f32* data, int rows, int cols) { res->rows = rows; res->cols = cols; res->data = data; @@ -22,8 +22,8 @@ __global__ void alloc(matrix* res, float* data, int rows, int cols) { matrix* new_matrix_d(int rows, int cols) { matrix* res; cudaMalloc(&res, sizeof(matrix)); - float* data; - cudaMalloc(&data, rows * cols * sizeof(float)); + f32* data; + cudaMalloc(&data, rows * cols * sizeof(f32)); alloc<<<1, 1>>>(res, data, rows, cols); return res; } @@ -31,9 +31,9 @@ matrix* new_matrix_d(int rows, int cols) { matrix* copy_to_device(matrix* h_mat) { matrix* res; cudaMalloc(&res, sizeof(matrix)); - float* data; - cudaMalloc(&data, h_mat->rows * h_mat->cols * sizeof(float)); - cudaMemcpy(data, h_mat->data, h_mat->rows * h_mat->cols * sizeof(float), cudaMemcpyHostToDevice); + f32* data; + cudaMalloc(&data, h_mat->rows * h_mat->cols * sizeof(f32)); + cudaMemcpy(data, h_mat->data, h_mat->rows * h_mat->cols * sizeof(f32), cudaMemcpyHostToDevice); alloc<<<1, 1>>>(res, data, h_mat->rows, h_mat->cols); return res; } @@ -42,14 +42,14 @@ __device__ __host__ matrix* create_copy(matrix* mat) { matrix* res = (matrix*)malloc(sizeof(matrix)); res->rows = mat->rows; res->cols = mat->cols; - res->data = (float*)malloc((res->rows * res->cols) * sizeof(float)); - memcpy(res->data, mat->data, res->rows * res->cols * sizeof(float)); + res->data = (f32*)malloc((res->rows * res->cols) * sizeof(f32)); + memcpy(res->data, mat->data, res->rows * res->cols * sizeof(f32)); return res; } -__device__ void matrix_mul(float* weight, float* input, float* result, int w_rows, int w_cols) { +__device__ void matrix_mul(f32* weight, f32* input, f32* result, int w_rows, int w_cols) { for (int i = 0; i < w_rows; i++) { - float sum = 0; + f32 sum = 0; int j = 0; for (; j <= w_cols - 4; j += 4) { @@ -65,31 +65,31 @@ __device__ void matrix_mul(float* weight, float* input, float* result, int w_row } } -__device__ void matrix_add(float* a, float* b, int rows) { +__device__ void matrix_add(f32* a, f32* b, int rows) { for (int i = 0; i < rows; i++) { a[i] += b[i]; } } -__device__ void relu(float* a, int rows) { +__device__ void relu(f32* a, int rows) { for (int i = 0; i < rows; i++) { a[i] = (a[i] > 0) ? a[i] : 0; } } -__device__ void softmax(float* a, int rows) { - float sum = 0.0; +__device__ void softmax(f32* a, int rows) { + f32 sum = 0.0; for (size_t i = 0; i < rows; i++) { sum += __expf(a[i]); } - float t = __logf(sum); + f32 t = __logf(sum); for (size_t i = 0; i < rows; i++) { a[i] = __expf(a[i] - t); } } -__device__ int argmax(float* a, int rows) { - float res = a[0]; +__device__ int argmax(f32* a, int rows) { + f32 res = a[0]; int idx = 0; for (int i = 0; i < rows; i++) { if (res < a[i]) { diff --git a/cudasrc/matrix.cuh b/cudasrc/matrix.cuh index 4c583c7..2c2ae7d 100644 --- a/cudasrc/matrix.cuh +++ b/cudasrc/matrix.cuh @@ -5,7 +5,7 @@ typedef float f32; typedef struct { int rows; int cols; - float* data; // array + f32* data; // array } matrix; __host__ __device__ matrix* new_matrix(int rows, int cols); @@ -14,14 +14,14 @@ matrix* copy_to_device(matrix* h_mat); matrix* new_matrix_d(int rows, int cols); -__device__ void matrix_mul(float* a, float* b, float* c, int rows, int cols); +__device__ void matrix_mul(f32* a, f32* b, f32* c, int rows, int cols); -__device__ void matrix_add(float* a, float* b, int rows); +__device__ void matrix_add(f32* a, f32* b, int rows); -__device__ void relu(float* a, int rows); +__device__ void relu(f32* a, int rows); -__device__ void softmax(float* a, int rows); +__device__ void softmax(f32* a, int rows); -__device__ int argmax(float* a, int rows); +__device__ int argmax(f32* a, int rows); __device__ __host__ matrix* create_copy(matrix* mat); diff --git a/src/file_io.c b/src/file_io.c index 936f6a4..79b77ed 100644 --- a/src/file_io.c +++ b/src/file_io.c @@ -20,7 +20,7 @@ int file_count(const char* dir_path) { void process_weights_str(matrix** weights, char* line, int layer) { char* token; - float value; + f32 value; const char* delimiter = ","; token = strtok(line, delimiter); @@ -34,7 +34,7 @@ void process_weights_str(matrix** weights, char* line, int layer) { void process_biases_str(vector** biases, char* line, int layer) { char* token; - float value; + f32 value; const char* delimiter = ","; token = strtok(line, delimiter); @@ -78,9 +78,9 @@ void read_tensor(f32* a, const char* file_name) { perror("Could not read tensor file. Exiting."); exit(EXIT_FAILURE); } - + char* token; - float value; + f32 value; const char* delimiter = ","; token = strtok(line, delimiter); diff --git a/src/main.c b/src/main.c index 909586b..6f5184b 100644 --- a/src/main.c +++ b/src/main.c @@ -25,7 +25,7 @@ char letters[52] = {'A', 'a', 'B', 'b', 'C', 'c', 'D', 'd', 'E', 'e', 'F', 'f', 'J', 'j', 'K', 'k', 'L', 'l', 'M', 'm', 'N', 'n', 'O', 'o', 'P', 'p', 'Q', 'q', 'R', 'r', 'S', 's', 'T', 't', 'U', 'u', 'V', 'v', 'W', 'w', 'X', 'x', 'Y', 'y', 'Z', 'z'}; -void propagate_fwd(const matrix* weights, const float* inputs, float* results, const vector* biases) { +void propagate_fwd(const matrix* weights, const f32* inputs, f32* results, const vector* biases) { sgemv_t_tuned(weights->data, inputs, results, weights->cols, weights->rows); // Add biases onto results vector_add_inplace(biases->len, biases->data, results); @@ -35,8 +35,8 @@ void propagate_fwd(const matrix* weights, const float* inputs, float* results, c // This code f***ing sucks but its fast so uhhhh u8 infer_reuse_layers_thread(vector* input, matrix** weights, vector** biases) { // Slightly larger than required for padding - float out0[104] __attribute__((aligned(SIMD_ALIGN))) = {0}; - float out1[72] __attribute__((aligned(SIMD_ALIGN))) = {0}; + f32 out0[104] __attribute__((aligned(SIMD_ALIGN))) = {0}; + f32 out1[72] __attribute__((aligned(SIMD_ALIGN))) = {0}; propagate_fwd(weights[0], input->data, out0, biases[0]); relu_inplace(out0, 98); diff --git a/src/matrix.c b/src/matrix.c index 39f818e..3d06924 100644 --- a/src/matrix.c +++ b/src/matrix.c @@ -39,7 +39,7 @@ vector* new_vec_aligned(int len) { } // ver. Artemis Rosman simd_intrin 2x8 -static void kernel(const float* in, const float* wg, float* rs, int start_row, int start_col, int w_width) { +static void kernel(const f32* in, const f32* wg, f32* rs, int start_row, int start_col, int w_width) { // printf("Kernel at row %d col %d\n", start_row, start_col); __m256 res = _mm256_load_ps(&rs[start_col]); @@ -54,7 +54,7 @@ static void kernel(const float* in, const float* wg, float* rs, int start_row, i // Ver. Artemis Rosman // W rows and W width is expected to be for the column major matrix, i.e. len of // in vec = w_rows, len of out vec = w_cols -void sgemv_t_tuned(const float* weights, const float* inputs, float* __restrict__ results, int w_width, int w_rows) { +void sgemv_t_tuned(const f32* weights, const f32* inputs, f32* __restrict__ results, int w_width, int w_rows) { // Perform mult using kernel for (int row = 0; row < w_rows; row += KERN_ROWS) { for (int col = 0; col < w_width; col += KERN_COLS) { @@ -77,15 +77,15 @@ void relu_inplace(f32* dest, int len) { } // Hacky but fast and accurate for existing inputs -static inline float fastexp(float x) { +static inline f32 fastexp(f32 x) { int tmp = (int)(1512775 * x + 1072632447); - float result; + f32 result; memcpy(&result, &tmp, sizeof(result)); return result; } void softmax_inplace(f32* dest, int len) { - float res = 0.0f; + f32 res = 0.0f; for (int i = 0; i < len; i++) { res += fastexp(dest[i]); } @@ -97,7 +97,7 @@ void softmax_inplace(f32* dest, int len) { // Get result from output layer u8 argmax(f32* in, int len) { int idx = 0; - float res = in[0]; + f32 res = in[0]; for (int i = 0; i < len; i++) { if (res < in[i]) { res = in[i]; From 12727bec7b64f47a3cbdfc2fa38574f86a7a617c Mon Sep 17 00:00:00 2001 From: Artemis Rosman <73006620+rozukke@users.noreply.github.com> Date: Mon, 8 Jul 2024 20:08:56 +1000 Subject: [PATCH 7/8] Fix formatting --- cudasrc/main.cu | 9 ++++----- cudasrc/matrix.cuh | 2 +- src/file_io.c | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/cudasrc/main.cu b/cudasrc/main.cu index 2f17a96..549f39a 100644 --- a/cudasrc/main.cu +++ b/cudasrc/main.cu @@ -229,7 +229,7 @@ int main(int argc, char* argv[]) { inputs = (f32*)malloc((input_count) * sizeof(f32) * TENSOR_LENGTH); cudaMalloc(&d_results, (input_count) * sizeof(int)); cudaMalloc(&d_inputs, (input_count) * sizeof(f32) * TENSOR_LENGTH); - + // Read and process inputs char* file_name = (char*)malloc((100) * sizeof(char)); char* file_num_str = (char*)malloc((100) * sizeof(char)); @@ -255,9 +255,9 @@ int main(int argc, char* argv[]) { // Move input array to GPU memory cudaMemcpy(d_inputs, inputs, sizeof(f32) * 225 * input_count, cudaMemcpyHostToDevice); -# ifdef USE_MPI +#ifdef USE_MPI int it_per_gpu = num_its / num_proccesses + (process_id < (num_its % num_proccesses) ? 1 : 0); -#else +#else int it_per_gpu = num_its; #endif @@ -293,8 +293,7 @@ int main(int argc, char* argv[]) { // Time taken gettimeofday(&stop, NULL); - printf("Total: %lu us\n", - (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec); + printf("Total: %lu us\n", (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec); #ifdef USE_MPI MPI_Finalize(); diff --git a/cudasrc/matrix.cuh b/cudasrc/matrix.cuh index 2c2ae7d..e7886f9 100644 --- a/cudasrc/matrix.cuh +++ b/cudasrc/matrix.cuh @@ -8,7 +8,7 @@ typedef struct { f32* data; // array } matrix; - __host__ __device__ matrix* new_matrix(int rows, int cols); +__host__ __device__ matrix* new_matrix(int rows, int cols); matrix* copy_to_device(matrix* h_mat); diff --git a/src/file_io.c b/src/file_io.c index 79b77ed..5e422c0 100644 --- a/src/file_io.c +++ b/src/file_io.c @@ -77,7 +77,7 @@ void read_tensor(f32* a, const char* file_name) { if (getline(&line, &len, file) == -1) { perror("Could not read tensor file. Exiting."); exit(EXIT_FAILURE); - } + } char* token; f32 value; From e716eb59774e2cf764ff55fb6db4bf866c852bcd Mon Sep 17 00:00:00 2001 From: Artemis Rosman <73006620+rozukke@users.noreply.github.com> Date: Mon, 8 Jul 2024 20:11:35 +1000 Subject: [PATCH 8/8] Fix CI failing (I think) --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0ead262..04734f4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.16) -project(ichida-algo LANGUAGES C CUDA) +project(ichida-algo LANGUAGES C) set(CMAKE_C_FLAGS "-O3 -march=native -ffast-math -funroll-loops -fopenmp -Wall -Wextra -Wpedantic") set(CMAKE_C_STANDARD 11)