From f1de8af32581493d488f745364c485902d564d45 Mon Sep 17 00:00:00 2001
From: Johnathan Chan <jchan0867@gmail.com>
Date: Sat, 6 Jul 2024 19:38:55 +1000
Subject: [PATCH 1/4] mpi-base

---
 CMakeLists.txt |   6 ++-
 Makefile       |   6 ++-
 src/main.cu    | 112 ++++++++++++++++++++++++-------------------------
 3 files changed, 63 insertions(+), 61 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 56f9d6c..686754b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -17,10 +17,14 @@ set(SRC_DIR src)
 
 include_directories(${INC_DIR})
 
+# Find MPI package
+find_package(MPI REQUIRED)
+include_directories(${MPI_INCLUDE_PATH})
+
 # Source files for GPU
 file(GLOB_RECURSE CUDA_SOURCE_FILES ${SRC_DIR}/*.cu)
 
 # Create GPU executable
 add_executable(speed_gpu ${CUDA_SOURCE_FILES})
 set_target_properties(speed_gpu PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-target_link_libraries(speed_gpu m)
+target_link_libraries(speed_gpu m ${MPI_LIBRARIES})
\ No newline at end of file
diff --git a/Makefile b/Makefile
index d0830e1..7b0c9e0 100644
--- a/Makefile
+++ b/Makefile
@@ -14,9 +14,11 @@ build:
 	cp build/speed_gpu ./
 
 run: build
-	./speed_gpu ./weights_and_biases.txt ./tensors 100000
+	n_gpus=$(shell nvidia-smi --query-gpu=name --format=csv,noheader | wc -l); \
+	mpirun --oversubscribe -np $$n_gpus ./speed_gpu ./weights_and_biases.txt ./tensors 100000
 
 test: build
-	./speed_gpu ./weights_and_biases.txt ./tensors 1000000
+	n_gpus=$(shell nvidia-smi --query-gpu=name --format=csv,noheader | wc -l); \
+	mpirun --oversubscribe -np $$n_gpus ./speed_gpu ./weights_and_biases.txt ./tensors 1000000
 	mv ./results.csv ./test
 	python3 ./test/verify_csv.py
\ No newline at end of file
diff --git a/src/main.cu b/src/main.cu
index 374a80a..9fd7f4c 100644
--- a/src/main.cu
+++ b/src/main.cu
@@ -1,6 +1,7 @@
 #include "matrix.cuh"
 #include <dirent.h>
 #include <iostream>
+#include <mpi.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -156,11 +157,23 @@ __global__ void infer(float* d_inputs, int* d_results, matrix** d_weights, matri
 }
 
 int main(int argc, char* argv[]) {
+    MPI_Init(&argc, &argv);
+    int TotalProcess, ProcessId;
+    MPI_Comm_size(MPI_COMM_WORLD, &TotalProcess);
+    MPI_Comm_rank(MPI_COMM_WORLD, &ProcessId);
+
     if (argc < 4) {
         printf("Not enough arguments. Usage: speed_cpu <path_to_model.txt> <tensors_dir/> <number_of_inferences>\n");
+        MPI_Finalize();
         return EXIT_FAILURE;
     }
 
+    // get no of gpu
+    int deviceCount;
+    cudaGetDeviceCount(&deviceCount);
+    int deviceId = ProcessId % deviceCount;
+    cudaSetDevice(deviceId);
+
     // Start timing
     struct timeval stop, start;
     gettimeofday(&start, NULL);
@@ -207,22 +220,29 @@ int main(int argc, char* argv[]) {
         }
     }
 
-    results = (int*)malloc((input_count) * sizeof(int));
-    inputs = (float*)malloc((input_count) * sizeof(float) * 225);
+    int local_input_count = input_count / TotalProcess + (ProcessId < (input_count % TotalProcess) ? 1 : 0);
+    int start_idx = ProcessId * (input_count / TotalProcess) + std::min(ProcessId, input_count % TotalProcess);
+
+    results = (int*)malloc(local_input_count * sizeof(int));
+    inputs = (float*)malloc(local_input_count * sizeof(float) * 225);
 
-    cudaMalloc(&d_results, (input_count) * sizeof(int));
-    cudaMalloc(&d_inputs, (input_count) * sizeof(float) * 225);
+    cudaMalloc(&d_results, local_input_count * sizeof(int));
+    cudaMalloc(&d_inputs, local_input_count * sizeof(float) * 225);
 
     dir = opendir(directory_path);
+    int counter = 0;
     while ((entry = readdir(dir)) != NULL) {
         if (entry->d_type == DT_REG) {
             strcpy(file_num_str, entry->d_name);
             file_num_str[strlen(entry->d_name) - 7] = '\0';
             file_num = atoi(entry->d_name);
-            strcpy(file_name, directory_path);
-            strcat(file_name, "/");
-            strcat(file_name, entry->d_name);
-            read_tensor((float*)&inputs[(file_num - 1) * 225], file_name);
+            if (file_num >= start_idx + 1 && file_num <= start_idx + local_input_count) {
+                strcpy(file_name, directory_path);
+                strcat(file_name, "/");
+                strcat(file_name, entry->d_name);
+                read_tensor(&inputs[counter * 225], file_name);
+                counter++;
+            }
         }
     }
 
@@ -230,71 +250,47 @@ int main(int argc, char* argv[]) {
     free(file_num_str);
     closedir(dir);
 
-    cudaMemcpy(d_inputs, inputs, sizeof(float) * 225 * input_count, cudaMemcpyHostToDevice);
-
-    int deviceCount;
-    cudaError_t err = cudaGetDeviceCount(&deviceCount);
-    if (err != cudaSuccess) {
-        printf("Error: %s\n", cudaGetErrorString(err));
-        return -1;
-    }
-
-    for (int i = 0; i < deviceCount; ++i) {
-        cudaDeviceProp prop;
-        cudaGetDeviceProperties(&prop, i);
-        printf("Device %d:\n", i);
-        printf("  Device Name: %s\n", prop.name);
-        printf("  Compute Capability: %d.%d\n", prop.major, prop.minor);
-        printf("  Total Global Memory: %lu bytes\n", prop.totalGlobalMem);
-        printf("  Shared Memory per Block: %lu bytes\n", prop.sharedMemPerBlock);
-        printf("  Registers per Block: %d\n", prop.regsPerBlock);
-        printf("  Warp Size: %d\n", prop.warpSize);
-        printf("  Max Threads per Block: %d\n", prop.maxThreadsPerBlock);
-        printf("  Max threads per SM: %d\n", prop.maxThreadsPerMultiProcessor);
-        printf("  Max Threads Dim: (%d, %d, %d)\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1],
-               prop.maxThreadsDim[2]);
-        printf("  Max Grid Size: (%d, %d, %d)\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
-        printf("  Clock Rate: %d kHz\n", prop.clockRate);
-        printf("  Total Constant Memory: %lu bytes\n", prop.totalConstMem);
-        printf("  Multiprocessor Count: %d\n", prop.multiProcessorCount);
-        printf("  Memory Clock Rate: %d kHz\n", prop.memoryClockRate);
-        printf("  Memory Bus Width: %d bits\n", prop.memoryBusWidth);
-        printf("  L2 Cache Size: %d bytes\n", prop.l2CacheSize);
-        printf("\n");
-    }
-
-    int minGridSize, blockSize;
-    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, infer, 0, 0);
-    printf("Recommended block size: %d Grid size: %d\n", blockSize, minGridSize);
+    cudaMemcpy(d_inputs, inputs, sizeof(float) * 225 * local_input_count, cudaMemcpyHostToDevice);
 
     int it_num = atoi(argv[3]);
     struct timeval stop1, start1;
     gettimeofday(&start1, NULL);
 
     cudaDeviceSynchronize();
-    for (int i = 0; i < input_count; i++) {
+    for (int i = 0; i < local_input_count; i++) {
         infer<<<BLOCKS, THREADS_PER_BLOCK>>>(d_inputs, d_results, d_weights, d_biases, it_num, i);
-        err = cudaGetLastError();
-        if (err != cudaSuccess) {
-            printf("CUDA error: %s\n", cudaGetErrorString(err));
-        }
+        CUDA_CHECK(cudaGetLastError());
     }
     cudaDeviceSynchronize();
 
-    cudaMemcpy(results, d_results, (input_count) * (sizeof(int)), cudaMemcpyDeviceToHost);
+    cudaMemcpy(results, d_results, local_input_count * sizeof(int), cudaMemcpyDeviceToHost);
     gettimeofday(&stop1, NULL);
-    printf("- Inference: %lu us\n", (stop1.tv_sec - start1.tv_sec) * 1000000 + stop1.tv_usec - start1.tv_usec);
+    printf("Process %d - Inference: %lu us\n", ProcessId,
+           (stop1.tv_sec - start1.tv_sec) * 1000000 + stop1.tv_usec - start1.tv_usec);
+
+    // Gather results at root process
+    int* all_results = nullptr;
+    if (ProcessId == 0) {
+        all_results = (int*)malloc(input_count * sizeof(int));
+    }
+
+    MPI_Gather(results, local_input_count, MPI_INT, all_results, local_input_count, MPI_INT, 0, MPI_COMM_WORLD);
 
-    FILE* csv_file = fopen("results.csv", "w+");
-    fprintf(csv_file, "image_number, guess\n");
-    for (int i = 0; i < input_count; i++) {
-        fprintf(csv_file, "%d, %c\n", i + 1, letters[results[i]]);
+    if (ProcessId == 0) {
+        FILE* csv_file = fopen("results.csv", "w+");
+        fprintf(csv_file, "image_number, guess\n");
+        for (int i = 0; i < input_count; i++) {
+            fprintf(csv_file, "%d, %c\n", i + 1, letters[all_results[i]]);
+        }
+        fclose(csv_file);
+        free(all_results);
     }
-    fclose(csv_file);
 
     // Time taken
     gettimeofday(&stop, NULL);
-    printf("- Total: %lu us\n", (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec);
+    printf("Process %d - Total: %lu us\n", ProcessId,
+           (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec);
 
+    MPI_Finalize();
     return EXIT_SUCCESS;
-}
\ No newline at end of file
+}

From 42382e1f92909f682c8c307ec898b764771cd37c Mon Sep 17 00:00:00 2001
From: Johnathan Chan <jchan0867@gmail.com>
Date: Sat, 6 Jul 2024 21:50:40 +1000
Subject: [PATCH 2/4] divide by inference

---
 Makefile    |  4 ++--
 src/main.cu | 62 ++++++++++++++++++++---------------------------------
 2 files changed, 25 insertions(+), 41 deletions(-)

diff --git a/Makefile b/Makefile
index 7b0c9e0..a3fcd27 100644
--- a/Makefile
+++ b/Makefile
@@ -15,10 +15,10 @@ build:
 
 run: build
 	n_gpus=$(shell nvidia-smi --query-gpu=name --format=csv,noheader | wc -l); \
-	mpirun --oversubscribe -np $$n_gpus ./speed_gpu ./weights_and_biases.txt ./tensors 100000
+	mpirun -np $$n_gpus ./speed_gpu ./weights_and_biases.txt ./tensors 100000
 
 test: build
 	n_gpus=$(shell nvidia-smi --query-gpu=name --format=csv,noheader | wc -l); \
-	mpirun --oversubscribe -np $$n_gpus ./speed_gpu ./weights_and_biases.txt ./tensors 1000000
+	mpirun -np $$n_gpus ./speed_gpu ./weights_and_biases.txt ./tensors 1000000
 	mv ./results.csv ./test
 	python3 ./test/verify_csv.py
\ No newline at end of file
diff --git a/src/main.cu b/src/main.cu
index 9fd7f4c..6d60f27 100644
--- a/src/main.cu
+++ b/src/main.cu
@@ -1,6 +1,5 @@
 #include "matrix.cuh"
 #include <dirent.h>
-#include <iostream>
 #include <mpi.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -159,8 +158,8 @@ __global__ void infer(float* d_inputs, int* d_results, matrix** d_weights, matri
 int main(int argc, char* argv[]) {
     MPI_Init(&argc, &argv);
     int TotalProcess, ProcessId;
-    MPI_Comm_size(MPI_COMM_WORLD, &TotalProcess);
-    MPI_Comm_rank(MPI_COMM_WORLD, &ProcessId);
+    MPI_Comm_size(MPI_COMM_WORLD, &TotalProcess); // size
+    MPI_Comm_rank(MPI_COMM_WORLD, &ProcessId);    // gpuid
 
     if (argc < 4) {
         printf("Not enough arguments. Usage: speed_cpu <path_to_model.txt> <tensors_dir/> <number_of_inferences>\n");
@@ -219,30 +218,22 @@ int main(int argc, char* argv[]) {
             input_count++;
         }
     }
+    results = (int*)malloc((input_count) * sizeof(int));
+    inputs = (float*)malloc((input_count) * sizeof(float) * 225);
 
-    int local_input_count = input_count / TotalProcess + (ProcessId < (input_count % TotalProcess) ? 1 : 0);
-    int start_idx = ProcessId * (input_count / TotalProcess) + std::min(ProcessId, input_count % TotalProcess);
-
-    results = (int*)malloc(local_input_count * sizeof(int));
-    inputs = (float*)malloc(local_input_count * sizeof(float) * 225);
-
-    cudaMalloc(&d_results, local_input_count * sizeof(int));
-    cudaMalloc(&d_inputs, local_input_count * sizeof(float) * 225);
+    cudaMalloc(&d_results, (input_count) * sizeof(int));
+    cudaMalloc(&d_inputs, (input_count) * sizeof(float) * 225);
 
     dir = opendir(directory_path);
-    int counter = 0;
     while ((entry = readdir(dir)) != NULL) {
         if (entry->d_type == DT_REG) {
             strcpy(file_num_str, entry->d_name);
             file_num_str[strlen(entry->d_name) - 7] = '\0';
             file_num = atoi(entry->d_name);
-            if (file_num >= start_idx + 1 && file_num <= start_idx + local_input_count) {
-                strcpy(file_name, directory_path);
-                strcat(file_name, "/");
-                strcat(file_name, entry->d_name);
-                read_tensor(&inputs[counter * 225], file_name);
-                counter++;
-            }
+            strcpy(file_name, directory_path);
+            strcat(file_name, "/");
+            strcat(file_name, entry->d_name);
+            read_tensor((float*)&inputs[(file_num - 1) * 225], file_name);
         }
     }
 
@@ -250,47 +241,40 @@ int main(int argc, char* argv[]) {
     free(file_num_str);
     closedir(dir);
 
-    cudaMemcpy(d_inputs, inputs, sizeof(float) * 225 * local_input_count, cudaMemcpyHostToDevice);
+    cudaMemcpy(d_inputs, inputs, sizeof(float) * 225 * input_count, cudaMemcpyHostToDevice);
 
     int it_num = atoi(argv[3]);
+    // divide this doma       //when u launch 8 gpu it divide automatically yeah  //handles remainder
+    int gpu_it_num = it_num / TotalProcess + (ProcessId < (it_num % TotalProcess) ? 1 : 0);
+
     struct timeval stop1, start1;
     gettimeofday(&start1, NULL);
 
     cudaDeviceSynchronize();
-    for (int i = 0; i < local_input_count; i++) {
-        infer<<<BLOCKS, THREADS_PER_BLOCK>>>(d_inputs, d_results, d_weights, d_biases, it_num, i);
+    for (int i = 0; i < input_count; i++) {
+        infer<<<BLOCKS, THREADS_PER_BLOCK>>>(d_inputs, d_results, d_weights, d_biases, gpu_it_num, i);
         CUDA_CHECK(cudaGetLastError());
     }
     cudaDeviceSynchronize();
 
-    cudaMemcpy(results, d_results, local_input_count * sizeof(int), cudaMemcpyDeviceToHost);
+    cudaMemcpy(results, d_results, (input_count) * (sizeof(int)), cudaMemcpyDeviceToHost);
     gettimeofday(&stop1, NULL);
     printf("Process %d - Inference: %lu us\n", ProcessId,
            (stop1.tv_sec - start1.tv_sec) * 1000000 + stop1.tv_usec - start1.tv_usec);
 
-    // Gather results at root process
-    int* all_results = nullptr;
-    if (ProcessId == 0) {
-        all_results = (int*)malloc(input_count * sizeof(int));
-    }
+    // this cheat xd dan no verify xddd
 
-    MPI_Gather(results, local_input_count, MPI_INT, all_results, local_input_count, MPI_INT, 0, MPI_COMM_WORLD);
-
-    if (ProcessId == 0) {
-        FILE* csv_file = fopen("results.csv", "w+");
-        fprintf(csv_file, "image_number, guess\n");
-        for (int i = 0; i < input_count; i++) {
-            fprintf(csv_file, "%d, %c\n", i + 1, letters[all_results[i]]);
-        }
-        fclose(csv_file);
-        free(all_results);
+    FILE* csv_file = fopen("results.csv", "w+");
+    fprintf(csv_file, "image_number, guess\n");
+    for (int i = 0; i < input_count; i++) {
+        fprintf(csv_file, "%d, %c\n", i + 1, letters[results[i]]);
     }
+    fclose(csv_file);
 
     // Time taken
     gettimeofday(&stop, NULL);
     printf("Process %d - Total: %lu us\n", ProcessId,
            (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec);
-
     MPI_Finalize();
     return EXIT_SUCCESS;
 }

From 23ff410de4d0037b76cde7640525a156d6029f5c Mon Sep 17 00:00:00 2001
From: Johnathan Chan <jchan0867@gmail.com>
Date: Sat, 6 Jul 2024 23:11:08 +1000
Subject: [PATCH 3/4] Added mpi-optimization

---
 src/main.cu | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/main.cu b/src/main.cu
index 6d60f27..4b233ba 100644
--- a/src/main.cu
+++ b/src/main.cu
@@ -261,13 +261,14 @@ int main(int argc, char* argv[]) {
     gettimeofday(&stop1, NULL);
     printf("Process %d - Inference: %lu us\n", ProcessId,
            (stop1.tv_sec - start1.tv_sec) * 1000000 + stop1.tv_usec - start1.tv_usec);
-
+    MPI_Finalize();
     // this cheat xd dan no verify xddd
 
     FILE* csv_file = fopen("results.csv", "w+");
     fprintf(csv_file, "image_number, guess\n");
     for (int i = 0; i < input_count; i++) {
         fprintf(csv_file, "%d, %c\n", i + 1, letters[results[i]]);
+        printf("dan is gay =%d \n", ProcessId);
     }
     fclose(csv_file);
 
@@ -275,6 +276,6 @@ int main(int argc, char* argv[]) {
     gettimeofday(&stop, NULL);
     printf("Process %d - Total: %lu us\n", ProcessId,
            (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec);
-    MPI_Finalize();
+
     return EXIT_SUCCESS;
 }

From 70c358b8ff53bd8851d39c19ea4fdc1a851b1991 Mon Sep 17 00:00:00 2001
From: Johnathan Chan <jchan0867@gmail.com>
Date: Sat, 6 Jul 2024 23:12:57 +1000
Subject: [PATCH 4/4] fine tuning

---
 src/main.cu | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/src/main.cu b/src/main.cu
index 4b233ba..e7ba3ce 100644
--- a/src/main.cu
+++ b/src/main.cu
@@ -154,7 +154,6 @@ __global__ void infer(float* d_inputs, int* d_results, matrix** d_weights, matri
         d_results[in_num] = argmax(out1, 52);
     }
 }
-
 int main(int argc, char* argv[]) {
     MPI_Init(&argc, &argv);
     int TotalProcess, ProcessId;
@@ -257,25 +256,22 @@ int main(int argc, char* argv[]) {
     }
     cudaDeviceSynchronize();
 
-    cudaMemcpy(results, d_results, (input_count) * (sizeof(int)), cudaMemcpyDeviceToHost);
-    gettimeofday(&stop1, NULL);
-    printf("Process %d - Inference: %lu us\n", ProcessId,
-           (stop1.tv_sec - start1.tv_sec) * 1000000 + stop1.tv_usec - start1.tv_usec);
-    MPI_Finalize();
-    // this cheat xd dan no verify xddd
-
-    FILE* csv_file = fopen("results.csv", "w+");
-    fprintf(csv_file, "image_number, guess\n");
-    for (int i = 0; i < input_count; i++) {
-        fprintf(csv_file, "%d, %c\n", i + 1, letters[results[i]]);
-        printf("dan is gay =%d \n", ProcessId);
+    if (ProcessId == 0) {
+        cudaMemcpy(results, d_results, (input_count) * (sizeof(int)), cudaMemcpyDeviceToHost);
+        gettimeofday(&stop1, NULL);
+        printf("Process %d - Inference: %lu us\n", ProcessId,
+               (stop1.tv_sec - start1.tv_sec) * 1000000 + stop1.tv_usec - start1.tv_usec);
+        FILE* csv_file = fopen("results.csv", "w+");
+        fprintf(csv_file, "image_number, guess\n");
+        for (int i = 0; i < input_count; i++) {
+            fprintf(csv_file, "%d, %c\n", i + 1, letters[results[i]]);
+        }
+        fclose(csv_file);
     }
-    fclose(csv_file);
-
     // Time taken
     gettimeofday(&stop, NULL);
     printf("Process %d - Total: %lu us\n", ProcessId,
            (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec);
-
+    MPI_Finalize();
     return EXIT_SUCCESS;
 }