diff --git a/CMakeLists.txt b/CMakeLists.txt index 9a1ceb6..56f9d6c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,12 +5,12 @@ project(ichida-algo LANGUAGES C CXX CUDA) # Set compiler flags set(CMAKE_C_FLAGS "-O3 -march=native -ffast-math -funroll-loops -Wall -Wextra") -set(CMAKE_C_STANDARD 99) +set(CMAKE_C_STANDARD 11) set(CMAKE_C_STANDARD_REQUIRED True) set(CMAKE_VERBOSE_MAKEFILE ON) # Ensure CUDA NVCC flags are set properly -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3 -arch=sm_75") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xptxas -O3 --use_fast_math -Xcompiler -march=native -unroll-aggressive -arch=sm_80") set(INC_DIR include) set(SRC_DIR src) @@ -23,4 +23,4 @@ file(GLOB_RECURSE CUDA_SOURCE_FILES ${SRC_DIR}/*.cu) # Create GPU executable add_executable(speed_gpu ${CUDA_SOURCE_FILES}) set_target_properties(speed_gpu PROPERTIES CUDA_SEPARABLE_COMPILATION ON) -target_link_libraries(speed_gpu m) \ No newline at end of file +target_link_libraries(speed_gpu m) diff --git a/Makefile b/Makefile index 06847d6..d0830e1 100644 --- a/Makefile +++ b/Makefile @@ -17,6 +17,6 @@ run: build ./speed_gpu ./weights_and_biases.txt ./tensors 100000 test: build - ./speed_gpu ./weights_and_biases.txt ./tensors 100000 + ./speed_gpu ./weights_and_biases.txt ./tensors 1000000 mv ./results.csv ./test python3 ./test/verify_csv.py \ No newline at end of file diff --git a/src/main.cu b/src/main.cu index efd9a67..374a80a 100644 --- a/src/main.cu +++ b/src/main.cu @@ -23,8 +23,9 @@ matrix* biases[NUM_LAYERS]; // device weights and biases; matrix** d_weights; matrix** d_biases; -matrix** d_inputs; +float* inputs; +float* d_inputs; int* results; int* d_results; @@ -83,7 +84,7 @@ void read_model(const char* file_name) { fclose(file); } -void read_tensor(matrix* a, const char* fileName) { +void read_tensor(float* a, const char* fileName) { FILE* file = fopen(fileName, "r"); char* line = NULL; size_t len = 0; @@ -96,64 +97,64 @@ void read_tensor(matrix* a, const char* fileName) { for (int 
i = 0; i < 225; i++) { value = strtof(token, NULL); - (a->data)[i] = value; + a[i] = value; token = strtok(NULL, delimiter); } free(line); fclose(file); } -__device__ void propagate_fwd(matrix* weights, matrix* input_layer, matrix* output_layer, matrix* biases) { - matrix_mul(weights->data, input_layer->data, output_layer->data, weights->rows, weights->cols); - matrix_add(output_layer->data, biases->data, biases->rows); +__device__ void propagate_fwd(matrix* weights, float* input_layer, float* output_layer, matrix* biases) { + matrix_mul(weights->data, input_layer, output_layer, weights->rows, weights->cols); + matrix_add(output_layer, biases->data, biases->rows); } -__global__ void infer(matrix** d_inputs, int* d_results, matrix** d_weights, matrix** d_biases, int it_per_input, +#define BLOCKS 108 +#define THREADS_PER_BLOCK 1024 + +__global__ void infer(float* d_inputs, int* d_results, matrix** d_weights, matrix** d_biases, int it_per_input, int in_num) { + + __shared__ float sharedInput[225]; + float out1[98]; + float out2[65]; + int num_threads = blockDim.x * gridDim.x; int thread_idx = (blockIdx.x * blockDim.x + threadIdx.x); - if (thread_idx > it_per_input) return; + float* input = (float*)&d_inputs[in_num * 225]; - matrix* input = d_inputs[in_num]; - - matrix* outputs[2]; - outputs[0] = new_matrix(98, 1); - outputs[1] = new_matrix(65, 1); + if (threadIdx.x < 225) { + sharedInput[threadIdx.x] = input[threadIdx.x]; + } + __syncthreads(); for (int i = thread_idx; i < it_per_input; i += num_threads) { - propagate_fwd(d_weights[0], input, outputs[0], d_biases[0]); - relu(outputs[0]->data, 98); + propagate_fwd(d_weights[0], sharedInput, out1, d_biases[0]); + relu(out1, 98); - propagate_fwd(d_weights[1], outputs[0], outputs[1], d_biases[1]); - relu(outputs[1]->data, 65); + propagate_fwd(d_weights[1], out1, out2, d_biases[1]); + relu(out2, 65); - propagate_fwd(d_weights[2], outputs[1], outputs[0], d_biases[2]); - relu(outputs[0]->data, 50); + 
propagate_fwd(d_weights[2], out2, out1, d_biases[2]); + relu(out1, 50); - propagate_fwd(d_weights[3], outputs[0], outputs[1], d_biases[3]); - relu(outputs[1]->data, 30); + propagate_fwd(d_weights[3], out1, out2, d_biases[3]); + relu(out2, 30); - propagate_fwd(d_weights[4], outputs[1], outputs[0], d_biases[4]); - relu(outputs[0]->data, 25); + propagate_fwd(d_weights[4], out2, out1, d_biases[4]); + relu(out1, 25); - propagate_fwd(d_weights[5], outputs[0], outputs[1], d_biases[5]); - relu(outputs[1]->data, 40); + propagate_fwd(d_weights[5], out1, out2, d_biases[5]); + relu(out2, 40); - propagate_fwd(d_weights[6], outputs[1], outputs[0], d_biases[6]); - softmax(outputs[0]->data, 52); + propagate_fwd(d_weights[6], out2, out1, d_biases[6]); + softmax(out1, 52); - int res = argmax(outputs[0]->data, 52); - d_results[in_num] = res; + d_results[in_num] = argmax(out1, 52); } - free(outputs[0]->data); - free(outputs[0]); - free(outputs[1]->data); - free(outputs[1]); } -#define IT_PER_IN 1000000 - int main(int argc, char* argv[]) { if (argc < 4) { printf("Not enough arguments. 
Usage: speed_cpu \n"); @@ -186,8 +187,7 @@ int main(int argc, char* argv[]) { for (int i = 0; i < NUM_LAYERS; i++) { matrix* a = copy_to_device(weights[i]); matrix* b = copy_to_device(biases[i]); - matrix** z = &(d_weights[i]); - CUDA_CHECK(cudaMemcpy(z, &a, sizeof(matrix*), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(&(d_weights[i]), &a, sizeof(matrix*), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(&(d_biases[i]), &b, sizeof(matrix*), cudaMemcpyHostToDevice)); } @@ -208,39 +208,82 @@ int main(int argc, char* argv[]) { } results = (int*)malloc((input_count) * sizeof(int)); - memset(results, 0, sizeof(int) * (input_count)); + inputs = (float*)malloc((input_count) * sizeof(float) * 225); + cudaMalloc(&d_results, (input_count) * sizeof(int)); - cudaMalloc(&d_inputs, (input_count) * sizeof(matrix*)); + cudaMalloc(&d_inputs, (input_count) * sizeof(float) * 225); dir = opendir(directory_path); while ((entry = readdir(dir)) != NULL) { if (entry->d_type == DT_REG) { - matrix* input = new_matrix(225, 1); strcpy(file_num_str, entry->d_name); file_num_str[strlen(entry->d_name) - 7] = '\0'; file_num = atoi(entry->d_name); strcpy(file_name, directory_path); strcat(file_name, "/"); strcat(file_name, entry->d_name); - read_tensor(input, file_name); - matrix* temp = copy_to_device(input); - cudaMemcpy(&d_inputs[file_num - 1], &temp, sizeof(matrix*), cudaMemcpyHostToDevice); - free(input); + read_tensor((float*)&inputs[(file_num - 1) * 225], file_name); } } + free(file_name); free(file_num_str); closedir(dir); - cudaMemset(d_results, 0, sizeof(int) * input_count); + cudaMemcpy(d_inputs, inputs, sizeof(float) * 225 * input_count, cudaMemcpyHostToDevice); + + int deviceCount; + cudaError_t err = cudaGetDeviceCount(&deviceCount); + if (err != cudaSuccess) { + printf("Error: %s\n", cudaGetErrorString(err)); + return -1; + } + + for (int i = 0; i < deviceCount; ++i) { + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, i); + printf("Device %d:\n", i); + printf(" Device 
Name: %s\n", prop.name); + printf(" Compute Capability: %d.%d\n", prop.major, prop.minor); + printf(" Total Global Memory: %lu bytes\n", prop.totalGlobalMem); + printf(" Shared Memory per Block: %lu bytes\n", prop.sharedMemPerBlock); + printf(" Registers per Block: %d\n", prop.regsPerBlock); + printf(" Warp Size: %d\n", prop.warpSize); + printf(" Max Threads per Block: %d\n", prop.maxThreadsPerBlock); + printf(" Max threads per SM: %d\n", prop.maxThreadsPerMultiProcessor); + printf(" Max Threads Dim: (%d, %d, %d)\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], + prop.maxThreadsDim[2]); + printf(" Max Grid Size: (%d, %d, %d)\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); + printf(" Clock Rate: %d kHz\n", prop.clockRate); + printf(" Total Constant Memory: %lu bytes\n", prop.totalConstMem); + printf(" Multiprocessor Count: %d\n", prop.multiProcessorCount); + printf(" Memory Clock Rate: %d kHz\n", prop.memoryClockRate); + printf(" Memory Bus Width: %d bits\n", prop.memoryBusWidth); + printf(" L2 Cache Size: %d bytes\n", prop.l2CacheSize); + printf("\n"); + } + + int minGridSize, blockSize; + cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, infer, 0, 0); + printf("Recommended block size: %d Grid size: %d\n", blockSize, minGridSize); + + int it_num = atoi(argv[3]); + struct timeval stop1, start1; + gettimeofday(&start1, NULL); - int iter_per_in = atoi(argv[3]); + cudaDeviceSynchronize(); for (int i = 0; i < input_count; i++) { - infer<<<108, 69>>>(d_inputs, d_results, d_weights, d_biases, iter_per_in, i); + infer<<<BLOCKS, THREADS_PER_BLOCK>>>(d_inputs, d_results, d_weights, d_biases, it_num, i); + err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(err)); + } } - cudaDeviceSynchronize(); + cudaMemcpy(results, d_results, (input_count) * (sizeof(int)), cudaMemcpyDeviceToHost); + gettimeofday(&stop1, NULL); + printf("- Inference: %lu us\n", (stop1.tv_sec - start1.tv_sec) * 1000000 + stop1.tv_usec - start1.tv_usec); 
FILE* csv_file = fopen("results.csv", "w+"); fprintf(csv_file, "image_number, guess\n"); @@ -251,7 +294,7 @@ int main(int argc, char* argv[]) { // Time taken gettimeofday(&stop, NULL); - printf("took %lu us\n", (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec); + printf("- Total: %lu us\n", (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec); return EXIT_SUCCESS; -} +} \ No newline at end of file diff --git a/src/matrix.cu b/src/matrix.cu index c9e38e1..c223f6c 100644 --- a/src/matrix.cu +++ b/src/matrix.cu @@ -26,7 +26,6 @@ matrix* new_matrix_d(int rows, int cols) { float* data; cudaMalloc(&data, rows * cols * sizeof(float)); alloc<<<1, 1>>>(res, data, rows, cols); - cudaDeviceSynchronize(); return res; } @@ -37,7 +36,6 @@ matrix* copy_to_device(matrix* h_mat) { cudaMalloc(&data, h_mat->rows * h_mat->cols * sizeof(float)); cudaMemcpy(data, h_mat->data, h_mat->rows * h_mat->cols * sizeof(float), cudaMemcpyHostToDevice); alloc<<<1, 1>>>(res, data, h_mat->rows, h_mat->cols); - cudaDeviceSynchronize(); return res; } @@ -89,17 +87,18 @@ static __device__ inline float fastexp(float x) { } __device__ void softmax(float* a, int rows) { - float res = (float)0; - for (int i = 0; i < rows; i++) { - res += exp(a[i]); + float sum = 0.0; + for (size_t i = 0; i < rows; i++) { + sum += __expf(a[i]); } - for (int i = 0; i < rows; i++) { - a[i] /= res; + float t = __logf(sum); + for (size_t i = 0; i < rows; i++) { + a[i] = __expf(a[i] - t); } } __device__ int argmax(float* a, int rows) { - int res = a[0]; + float res = a[0]; int idx = 0; for (int i = 0; i < rows; i++) { if (res < a[i]) {