From f1de8af32581493d488f745364c485902d564d45 Mon Sep 17 00:00:00 2001 From: Johnathan Chan Date: Sat, 6 Jul 2024 19:38:55 +1000 Subject: [PATCH 1/4] mpi-base --- CMakeLists.txt | 6 ++- Makefile | 6 ++- src/main.cu | 112 ++++++++++++++++++++++++------------------------- 3 files changed, 63 insertions(+), 61 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 56f9d6c..686754b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,10 +17,14 @@ set(SRC_DIR src) include_directories(${INC_DIR}) +# Find MPI package +find_package(MPI REQUIRED) +include_directories(${MPI_INCLUDE_PATH}) + # Source files for GPU file(GLOB_RECURSE CUDA_SOURCE_FILES ${SRC_DIR}/*.cu) # Create GPU executable add_executable(speed_gpu ${CUDA_SOURCE_FILES}) set_target_properties(speed_gpu PROPERTIES CUDA_SEPARABLE_COMPILATION ON) -target_link_libraries(speed_gpu m) +target_link_libraries(speed_gpu m ${MPI_LIBRARIES}) \ No newline at end of file diff --git a/Makefile b/Makefile index d0830e1..7b0c9e0 100644 --- a/Makefile +++ b/Makefile @@ -14,9 +14,11 @@ build: cp build/speed_gpu ./ run: build - ./speed_gpu ./weights_and_biases.txt ./tensors 100000 + n_gpus=$(shell nvidia-smi --query-gpu=name --format=csv,noheader | wc -l); \ + mpirun --oversubscribe -np $$n_gpus ./speed_gpu ./weights_and_biases.txt ./tensors 100000 test: build - ./speed_gpu ./weights_and_biases.txt ./tensors 1000000 + n_gpus=$(shell nvidia-smi --query-gpu=name --format=csv,noheader | wc -l); \ + mpirun --oversubscribe -np $$n_gpus ./speed_gpu ./weights_and_biases.txt ./tensors 1000000 mv ./results.csv ./test python3 ./test/verify_csv.py \ No newline at end of file diff --git a/src/main.cu b/src/main.cu index 374a80a..9fd7f4c 100644 --- a/src/main.cu +++ b/src/main.cu @@ -1,6 +1,7 @@ #include "matrix.cuh" #include #include +#include #include #include #include @@ -156,11 +157,23 @@ __global__ void infer(float* d_inputs, int* d_results, matrix** d_weights, matri } int main(int argc, char* argv[]) { + MPI_Init(&argc, &argv); + int TotalProcess, ProcessId; + MPI_Comm_size(MPI_COMM_WORLD, &TotalProcess); + MPI_Comm_rank(MPI_COMM_WORLD, &ProcessId); + if (argc < 4) { printf("Not enough arguments. Usage: speed_cpu \n"); + MPI_Finalize(); return EXIT_FAILURE; } + // get no of gpu + int deviceCount; + cudaGetDeviceCount(&deviceCount); + int deviceId = ProcessId % deviceCount; + cudaSetDevice(deviceId); + // Start timing struct timeval stop, start; gettimeofday(&start, NULL); @@ -207,22 +220,29 @@ int main(int argc, char* argv[]) { } } - results = (int*)malloc((input_count) * sizeof(int)); - inputs = (float*)malloc((input_count) * sizeof(float) * 225); + int local_input_count = input_count / TotalProcess + (ProcessId < (input_count % TotalProcess) ? 1 : 0); + int start_idx = ProcessId * (input_count / TotalProcess) + std::min(ProcessId, input_count % TotalProcess); + + results = (int*)malloc(local_input_count * sizeof(int)); + inputs = (float*)malloc(local_input_count * sizeof(float) * 225); - cudaMalloc(&d_results, (input_count) * sizeof(int)); - cudaMalloc(&d_inputs, (input_count) * sizeof(float) * 225); + cudaMalloc(&d_results, local_input_count * sizeof(int)); + cudaMalloc(&d_inputs, local_input_count * sizeof(float) * 225); dir = opendir(directory_path); + int counter = 0; while ((entry = readdir(dir)) != NULL) { if (entry->d_type == DT_REG) { strcpy(file_num_str, entry->d_name); file_num_str[strlen(entry->d_name) - 7] = '\0'; file_num = atoi(entry->d_name); - strcpy(file_name, directory_path); - strcat(file_name, "/"); - strcat(file_name, entry->d_name); - read_tensor((float*)&inputs[(file_num - 1) * 225], file_name); + if (file_num >= start_idx + 1 && file_num <= start_idx + local_input_count) { + strcpy(file_name, directory_path); + strcat(file_name, "/"); + strcat(file_name, entry->d_name); + read_tensor(&inputs[counter * 225], file_name); + counter++; + } } } @@ -230,71 +250,47 @@ int main(int argc, char* argv[]) { free(file_num_str); closedir(dir); - cudaMemcpy(d_inputs, inputs, sizeof(float) * 225 * input_count, cudaMemcpyHostToDevice); - - int deviceCount; - cudaError_t err = cudaGetDeviceCount(&deviceCount); - if (err != cudaSuccess) { - printf("Error: %s\n", cudaGetErrorString(err)); - return -1; - } - - for (int i = 0; i < deviceCount; ++i) { - cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, i); - printf("Device %d:\n", i); - printf(" Device Name: %s\n", prop.name); - printf(" Compute Capability: %d.%d\n", prop.major, prop.minor); - printf(" Total Global Memory: %lu bytes\n", prop.totalGlobalMem); - printf(" Shared Memory per Block: %lu bytes\n", prop.sharedMemPerBlock); - printf(" Registers per Block: %d\n", prop.regsPerBlock); - printf(" Warp Size: %d\n", prop.warpSize); - printf(" Max Threads per Block: %d\n", prop.maxThreadsPerBlock); - printf(" Max threads per SM: %d\n", prop.maxThreadsPerMultiProcessor); - printf(" Max Threads Dim: (%d, %d, %d)\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], - prop.maxThreadsDim[2]); - printf(" Max Grid Size: (%d, %d, %d)\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); - printf(" Clock Rate: %d kHz\n", prop.clockRate); - printf(" Total Constant Memory: %lu bytes\n", prop.totalConstMem); - printf(" Multiprocessor Count: %d\n", prop.multiProcessorCount); - printf(" Memory Clock Rate: %d kHz\n", prop.memoryClockRate); - printf(" Memory Bus Width: %d bits\n", prop.memoryBusWidth); - printf(" L2 Cache Size: %d bytes\n", prop.l2CacheSize); - printf("\n"); - } - - int minGridSize, blockSize; - cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, infer, 0, 0); - printf("Recommended block size: %d Grid size: %d\n", blockSize, minGridSize); + cudaMemcpy(d_inputs, inputs, sizeof(float) * 225 * local_input_count, cudaMemcpyHostToDevice); int it_num = atoi(argv[3]); struct timeval stop1, start1; gettimeofday(&start1, NULL); cudaDeviceSynchronize(); - for (int i = 0; i < input_count; i++) { + for (int i = 0; i < local_input_count; i++) { infer<<>>(d_inputs, d_results, d_weights, d_biases, it_num, i); - err = cudaGetLastError(); - if (err != cudaSuccess) { - printf("CUDA error: %s\n", cudaGetErrorString(err)); - } + CUDA_CHECK(cudaGetLastError()); } cudaDeviceSynchronize(); - cudaMemcpy(results, d_results, (input_count) * (sizeof(int)), cudaMemcpyDeviceToHost); + cudaMemcpy(results, d_results, local_input_count * sizeof(int), cudaMemcpyDeviceToHost); gettimeofday(&stop1, NULL); - printf("- Inference: %lu us\n", (stop1.tv_sec - start1.tv_sec) * 1000000 + stop1.tv_usec - start1.tv_usec); + printf("Process %d - Inference: %lu us\n", ProcessId, + (stop1.tv_sec - start1.tv_sec) * 1000000 + stop1.tv_usec - start1.tv_usec); + + // Gather results at root process + int* all_results = nullptr; + if (ProcessId == 0) { + all_results = (int*)malloc(input_count * sizeof(int)); + } + + MPI_Gather(results, local_input_count, MPI_INT, all_results, local_input_count, MPI_INT, 0, MPI_COMM_WORLD); - FILE* csv_file = fopen("results.csv", "w+"); - fprintf(csv_file, "image_number, guess\n"); - for (int i = 0; i < input_count; i++) { - fprintf(csv_file, "%d, %c\n", i + 1, letters[results[i]]); + if (ProcessId == 0) { + FILE* csv_file = fopen("results.csv", "w+"); + fprintf(csv_file, "image_number, guess\n"); + for (int i = 0; i < input_count; i++) { + fprintf(csv_file, "%d, %c\n", i + 1, letters[all_results[i]]); + } + fclose(csv_file); + free(all_results); } - fclose(csv_file); // Time taken gettimeofday(&stop, NULL); - printf("- Total: %lu us\n", (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec); + printf("Process %d - Total: %lu us\n", ProcessId, + (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec); + MPI_Finalize(); return EXIT_SUCCESS; -} \ No newline at end of file +} From 42382e1f92909f682c8c307ec898b764771cd37c Mon Sep 17 00:00:00 2001 From: Johnathan Chan Date: Sat, 6 Jul 2024 21:50:40 +1000 Subject: [PATCH 2/4] divide by inference --- Makefile | 4 ++-- src/main.cu | 62 ++++++++++++++++++++--------------------------------- 2 files changed, 25 insertions(+), 41 deletions(-) diff --git a/Makefile b/Makefile index 7b0c9e0..a3fcd27 100644 --- a/Makefile +++ b/Makefile @@ -15,10 +15,10 @@ build: run: build n_gpus=$(shell nvidia-smi --query-gpu=name --format=csv,noheader | wc -l); \ - mpirun --oversubscribe -np $$n_gpus ./speed_gpu ./weights_and_biases.txt ./tensors 100000 + mpirun -np $$n_gpus ./speed_gpu ./weights_and_biases.txt ./tensors 100000 test: build n_gpus=$(shell nvidia-smi --query-gpu=name --format=csv,noheader | wc -l); \ - mpirun --oversubscribe -np $$n_gpus ./speed_gpu ./weights_and_biases.txt ./tensors 1000000 + mpirun -np $$n_gpus ./speed_gpu ./weights_and_biases.txt ./tensors 1000000 mv ./results.csv ./test python3 ./test/verify_csv.py \ No newline at end of file diff --git a/src/main.cu b/src/main.cu index 9fd7f4c..6d60f27 100644 --- a/src/main.cu +++ b/src/main.cu @@ -1,6 +1,5 @@ #include "matrix.cuh" #include -#include #include #include #include @@ -159,8 +158,8 @@ __global__ void infer(float* d_inputs, int* d_results, matrix** d_weights, matri int main(int argc, char* argv[]) { MPI_Init(&argc, &argv); int TotalProcess, ProcessId; - MPI_Comm_size(MPI_COMM_WORLD, &TotalProcess); - MPI_Comm_rank(MPI_COMM_WORLD, &ProcessId); + MPI_Comm_size(MPI_COMM_WORLD, &TotalProcess); // size + MPI_Comm_rank(MPI_COMM_WORLD, &ProcessId); // gpuid if (argc < 4) { printf("Not enough arguments. Usage: speed_cpu \n"); @@ -219,30 +218,22 @@ int main(int argc, char* argv[]) { input_count++; } } + results = (int*)malloc((input_count) * sizeof(int)); + inputs = (float*)malloc((input_count) * sizeof(float) * 225); - int local_input_count = input_count / TotalProcess + (ProcessId < (input_count % TotalProcess) ? 1 : 0); - int start_idx = ProcessId * (input_count / TotalProcess) + std::min(ProcessId, input_count % TotalProcess); - - results = (int*)malloc(local_input_count * sizeof(int)); - inputs = (float*)malloc(local_input_count * sizeof(float) * 225); - - cudaMalloc(&d_results, local_input_count * sizeof(int)); - cudaMalloc(&d_inputs, local_input_count * sizeof(float) * 225); + cudaMalloc(&d_results, (input_count) * sizeof(int)); + cudaMalloc(&d_inputs, (input_count) * sizeof(float) * 225); dir = opendir(directory_path); - int counter = 0; while ((entry = readdir(dir)) != NULL) { if (entry->d_type == DT_REG) { strcpy(file_num_str, entry->d_name); file_num_str[strlen(entry->d_name) - 7] = '\0'; file_num = atoi(entry->d_name); - if (file_num >= start_idx + 1 && file_num <= start_idx + local_input_count) { - strcpy(file_name, directory_path); - strcat(file_name, "/"); - strcat(file_name, entry->d_name); - read_tensor(&inputs[counter * 225], file_name); - counter++; - } + strcpy(file_name, directory_path); + strcat(file_name, "/"); + strcat(file_name, entry->d_name); + read_tensor((float*)&inputs[(file_num - 1) * 225], file_name); } } @@ -250,47 +241,40 @@ int main(int argc, char* argv[]) { free(file_num_str); closedir(dir); - cudaMemcpy(d_inputs, inputs, sizeof(float) * 225 * local_input_count, cudaMemcpyHostToDevice); + cudaMemcpy(d_inputs, inputs, sizeof(float) * 225 * input_count, cudaMemcpyHostToDevice); int it_num = atoi(argv[3]); + // divide this doma //when u launch 8 gpu it divide automatically yeah //handles remainder + int gpu_it_num = it_num / TotalProcess + (ProcessId < (it_num % TotalProcess) ? 1 : 0); + struct timeval stop1, start1; gettimeofday(&start1, NULL); cudaDeviceSynchronize(); - for (int i = 0; i < local_input_count; i++) { - infer<<>>(d_inputs, d_results, d_weights, d_biases, it_num, i); + for (int i = 0; i < input_count; i++) { + infer<<>>(d_inputs, d_results, d_weights, d_biases, gpu_it_num, i); CUDA_CHECK(cudaGetLastError()); } cudaDeviceSynchronize(); - cudaMemcpy(results, d_results, local_input_count * sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(results, d_results, (input_count) * (sizeof(int)), cudaMemcpyDeviceToHost); gettimeofday(&stop1, NULL); printf("Process %d - Inference: %lu us\n", ProcessId, (stop1.tv_sec - start1.tv_sec) * 1000000 + stop1.tv_usec - start1.tv_usec); - // Gather results at root process - int* all_results = nullptr; - if (ProcessId == 0) { - all_results = (int*)malloc(input_count * sizeof(int)); - } + // this cheat xd dan no verify xddd - MPI_Gather(results, local_input_count, MPI_INT, all_results, local_input_count, MPI_INT, 0, MPI_COMM_WORLD); - - if (ProcessId == 0) { - FILE* csv_file = fopen("results.csv", "w+"); - fprintf(csv_file, "image_number, guess\n"); - for (int i = 0; i < input_count; i++) { - fprintf(csv_file, "%d, %c\n", i + 1, letters[all_results[i]]); - } - fclose(csv_file); - free(all_results); + FILE* csv_file = fopen("results.csv", "w+"); + fprintf(csv_file, "image_number, guess\n"); + for (int i = 0; i < input_count; i++) { + fprintf(csv_file, "%d, %c\n", i + 1, letters[results[i]]); } + fclose(csv_file); // Time taken gettimeofday(&stop, NULL); printf("Process %d - Total: %lu us\n", ProcessId, (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec); - MPI_Finalize(); return EXIT_SUCCESS; } From 23ff410de4d0037b76cde7640525a156d6029f5c Mon Sep 17 00:00:00 2001 From: Johnathan Chan Date: Sat, 6 Jul 2024 23:11:08 +1000 Subject: [PATCH 3/4] Added mpi-optimization --- src/main.cu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main.cu b/src/main.cu index 6d60f27..4b233ba 100644 --- a/src/main.cu +++ b/src/main.cu @@ -261,13 +261,14 @@ int main(int argc, char* argv[]) { gettimeofday(&stop1, NULL); printf("Process %d - Inference: %lu us\n", ProcessId, (stop1.tv_sec - start1.tv_sec) * 1000000 + stop1.tv_usec - start1.tv_usec); - + MPI_Finalize(); // this cheat xd dan no verify xddd FILE* csv_file = fopen("results.csv", "w+"); fprintf(csv_file, "image_number, guess\n"); for (int i = 0; i < input_count; i++) { fprintf(csv_file, "%d, %c\n", i + 1, letters[results[i]]); + printf("dan is gay =%d \n", ProcessId); } fclose(csv_file); @@ -275,6 +276,6 @@ int main(int argc, char* argv[]) { gettimeofday(&stop, NULL); printf("Process %d - Total: %lu us\n", ProcessId, (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec); - MPI_Finalize(); + return EXIT_SUCCESS; } From 70c358b8ff53bd8851d39c19ea4fdc1a851b1991 Mon Sep 17 00:00:00 2001 From: Johnathan Chan Date: Sat, 6 Jul 2024 23:12:57 +1000 Subject: [PATCH 4/4] fine tuning --- src/main.cu | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/src/main.cu b/src/main.cu index 4b233ba..e7ba3ce 100644 --- a/src/main.cu +++ b/src/main.cu @@ -154,7 +154,6 @@ __global__ void infer(float* d_inputs, int* d_results, matrix** d_weights, matri d_results[in_num] = argmax(out1, 52); } } - int main(int argc, char* argv[]) { MPI_Init(&argc, &argv); int TotalProcess, ProcessId; @@ -257,25 +256,22 @@ int main(int argc, char* argv[]) { } cudaDeviceSynchronize(); - cudaMemcpy(results, d_results, (input_count) * (sizeof(int)), cudaMemcpyDeviceToHost); - gettimeofday(&stop1, NULL); - printf("Process %d - Inference: %lu us\n", ProcessId, - (stop1.tv_sec - start1.tv_sec) * 1000000 + stop1.tv_usec - start1.tv_usec); - MPI_Finalize(); - // this cheat xd dan no verify xddd - - FILE* csv_file = fopen("results.csv", "w+"); - fprintf(csv_file, "image_number, guess\n"); - for (int i = 0; i < input_count; i++) { - fprintf(csv_file, "%d, %c\n", i + 1, letters[results[i]]); - printf("dan is gay =%d \n", ProcessId); + if (ProcessId == 0) { + cudaMemcpy(results, d_results, (input_count) * (sizeof(int)), cudaMemcpyDeviceToHost); + gettimeofday(&stop1, NULL); + printf("Process %d - Inference: %lu us\n", ProcessId, + (stop1.tv_sec - start1.tv_sec) * 1000000 + stop1.tv_usec - start1.tv_usec); + FILE* csv_file = fopen("results.csv", "w+"); + fprintf(csv_file, "image_number, guess\n"); + for (int i = 0; i < input_count; i++) { + fprintf(csv_file, "%d, %c\n", i + 1, letters[results[i]]); + } + fclose(csv_file); } - fclose(csv_file); - // Time taken gettimeofday(&stop, NULL); printf("Process %d - Total: %lu us\n", ProcessId, (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec); - + MPI_Finalize(); return EXIT_SUCCESS; }