Merge pull request #26 from nhatdongdang/feat/gpu-multithread

Gpu multithread
kachi-group · Jul 5, 2024 · 2e2c1e5 · 2e2c1e5
2 parents a4b925b + e8db37a
commit 2e2c1e5
Show file tree

Hide file tree

Showing 10 changed files with 321 additions and 103 deletions.
diff --git a/benchmark/matrix_add/Makefile b/benchmark/matrix_add/Makefile
@@ -0,0 +1,19 @@
+# Compiler invocation shared by every benchmark binary.
+compile = nvcc -O3 -arch=sm_75 --use_fast_math 
+SRC_DIR := versions
+BIN_DIR := bin
+SRC_FILES := $(wildcard $(SRC_DIR)/*.cu)
+EXECUTABLES := $(patsubst $(SRC_DIR)/%.cu, $(BIN_DIR)/%, $(SRC_FILES))
+
+# These targets never create a file of the same name; declaring them phony
+# keeps a stray file called "all", "clean" or "plot" from disabling them.
+.PHONY: all clean plot
+
+# Always rebuild from scratch so only freshly built binaries are in $(BIN_DIR).
+all: clean $(EXECUTABLES) 
+
+# Recreate an empty output directory.
+clean:
+	rm -f -r $(BIN_DIR)
+	mkdir -p $(BIN_DIR)
+
+# Build one version together with the shared driver; the binary is written
+# as $(BIN_DIR)/<name>.exe.
+$(BIN_DIR)/%: $(SRC_DIR)/%.cu
+	$(compile) $< benchmark.cu -o $@.exe 
+
+# Build everything, then run the benchmarks and render the plot.
+plot: all
+	python3 ./plot.py
+
+
diff --git a/benchmark/matrix_add/benchmark.cu b/benchmark/matrix_add/benchmark.cu
@@ -0,0 +1,13 @@
+#include "template.cuh"
+#include <stdio.h>
+#include <time.h>
+
+/*
+ * Benchmark driver: runs the linked version's time() for the requested
+ * number of repetitions and prints the elapsed seconds to stdout.
+ *
+ * argv[1] (optional): repetition count, parsed with atol; defaults to 100000.
+ * Exit status: 0 on success, 1 if the count does not fit in the int that
+ * time() accepts.
+ */
+int main(int argc, char* argv[]) {
+    long n = 100000; // default repetition count when no argument is given
+    if (argc > 1) {
+        n = atol(argv[1]);
+    }
+    // time() takes an int; refuse counts that would be silently truncated.
+    // n can only leave the int range when it came from argv[1].
+    if ((long)(int)n != n) {
+        fprintf(stderr, "iteration count out of range: %s\n", argv[1]);
+        return 1;
+    }
+    // No trailing newline: the output is a single number read by plot.py.
+    printf("%f", time((int)n));
+    return 0;
+}
diff --git a/benchmark/matrix_add/benchmark_plot.png b/benchmark/matrix_add/benchmark_plot.png
diff --git a/benchmark/matrix_add/plot.py b/benchmark/matrix_add/plot.py
@@ -0,0 +1,50 @@
+import os
+import subprocess
+import matplotlib.pyplot as plt
+
+result = subprocess.run(['make'], capture_output=True, text=True)
+# Define the folder containing the executables
+folder_path = './bin'  # Change this to your bin folder path
+
+# Define the input sizes to test
+start=10000
+end=10000
+step=100000
+
+input_sizes = list(range(start, end+1, step))
+# Initialize a dictionary to store runtimes for each executable
+runtimes = {exe: [] for exe in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, exe))}
+
+# Loop through each executable
+for exe in runtimes.keys():
+    exe_path = os.path.join(folder_path, exe)
+
+    # Loop through each input size
+    for n in range(start,end+1,step):
+        # Run the executable with the input size and capture its output
+        result = subprocess.run([exe_path, str(n)], capture_output=True, text=True)
+
+        # Parse the output to get the runtime
+        runtime = float(result.stdout.strip())
+        print(exe,runtime)
+
+        # Append the runtime to the corresponding executable list
+        runtimes[exe].append(runtime)
+
+# Plot the data
+plt.figure(figsize=(12, 6))
+
+# Loop through each executable and plot the runtimes
+for exe, times in runtimes.items():
+    plt.plot(input_sizes, times, marker='o', label=exe)
+
+plt.xlabel('Iterations')
+plt.ylabel('Runtime (s)')
+plt.title('Benchmark of Function Versions')
+plt.legend()
+plt.grid(True)
+plt.tight_layout()
+
+output_file = 'benchmark_plot.png'  # Specify your desired output file name and format
+plt.savefig(output_file)
+# Show the plot
diff --git a/benchmark/matrix_add/template.cuh b/benchmark/matrix_add/template.cuh
@@ -0,0 +1,10 @@
+#pragma once
+
+// Dense float matrix stored as one flat buffer of rows * cols elements.
+typedef struct {
+    int rows; // number of rows
+    int cols; // number of columns
+    float* data; // flat array of rows * cols floats; host or device memory depending on the allocator used
+} matrix;
+
+// Benchmark entry point implemented by each file in versions/: repeats the
+// matrix addition n times and returns the elapsed time in seconds.
+double time(int n);
+// Allocates a rows x cols matrix whose data buffer is obtained with cudaMalloc;
+// the matrix struct itself is allocated on the host.
+matrix* new_matrix_d(int rows, int cols);
diff --git a/benchmark/matrix_add/versions/1.cu b/benchmark/matrix_add/versions/1.cu
@@ -0,0 +1,44 @@
+#include "../template.cuh"
+
+/*
+ * Allocates a host matrix of rows x cols floats. The element buffer is left
+ * uninitialised.
+ *
+ * Returns the new matrix, or NULL if a dimension is negative or an
+ * allocation fails. The caller owns both the struct and its data buffer.
+ */
+matrix* new_matrix(int rows, int cols) {
+    if (rows < 0 || cols < 0) {
+        return NULL;
+    }
+    matrix* res = (matrix*)malloc(sizeof(matrix));
+    if (res == NULL) {
+        return NULL;
+    }
+    res->rows = rows;
+    res->cols = cols;
+    // Multiply in size_t so large dimensions cannot overflow int arithmetic.
+    size_t count = (size_t)rows * (size_t)cols;
+    res->data = (float*)malloc(count * sizeof(float));
+    // malloc(0) may legitimately return NULL, so only a non-empty request
+    // that comes back NULL counts as a failure.
+    if (res->data == NULL && count != 0) {
+        free(res);
+        return NULL;
+    }
+    return res;
+}
+
+/*
+ * Allocates a matrix whose rows x cols float buffer is obtained with
+ * cudaMalloc; the matrix struct itself is allocated with malloc on the host.
+ * The buffer is left uninitialised.
+ *
+ * Returns the new matrix, or NULL if a dimension is negative or either
+ * allocation fails.
+ */
+matrix* new_matrix_d(int rows, int cols) {
+    if (rows < 0 || cols < 0) {
+        return NULL;
+    }
+    matrix* res = (matrix*)malloc(sizeof(matrix));
+    if (res == NULL) {
+        return NULL;
+    }
+    res->rows = rows;
+    res->cols = cols;
+    // Multiply in size_t so large dimensions cannot overflow int arithmetic.
+    size_t bytes = (size_t)rows * (size_t)cols * sizeof(float);
+    float* device_data = NULL;
+    if (cudaMalloc((void**)&device_data, bytes) != cudaSuccess) {
+        // Reset the recorded error so a later cudaGetLastError() check is
+        // not confused by this already-handled failure.
+        cudaGetLastError();
+        free(res);
+        return NULL;
+    }
+    res->data = device_data;
+    return res;
+}
+
+// Element-wise in-place addition: a[i] += b[i] for every i below `rows`.
+// Each thread handles the single element at its flat 1D global index, so the
+// launch must supply at least `rows` threads in total to cover every element.
+__global__ void matrix_add(float *a, float*b ,int rows)
+{
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    // Threads past the end of the data have nothing to do.
+    if (tid >= rows) {
+        return;
+    }
+    a[tid] = a[tid] + b[tid];
+}
+
+// Frees a matrix created by new_matrix_d together with its device buffer.
+// A NULL matrix is ignored.
+static void release_matrix_d(matrix* m) {
+    if (m != NULL) {
+        cudaFree(m->data);
+        free(m);
+    }
+}
+
+/*
+ * Times n launches of the matrix_add kernel on two 100000 x 1 device
+ * matrices.
+ *
+ * The launch uses enough 1024-thread blocks to cover every element, and the
+ * clock is stopped only after the stream has drained: kernel launches are
+ * asynchronous, so without the synchronisation only the cost of enqueueing
+ * the launches would be measured.
+ *
+ * n: number of kernel launches to time.
+ * Returns the elapsed time in seconds as measured by clock(), or -1.0 if an
+ * allocation or any CUDA call fails.
+ */
+double time(int n) {
+    const int row = 100000;
+    const size_t bytes = (size_t)row * sizeof(float);
+    matrix* a = new_matrix_d(row, 1);
+    matrix* b = new_matrix_d(row, 1);
+
+    cudaStream_t stream1;
+    bool stream_ready = false;
+    cudaError_t status =
+        (a != NULL && b != NULL) ? cudaSuccess : cudaErrorMemoryAllocation;
+    if (status == cudaSuccess) {
+        status = cudaStreamCreate(&stream1);
+        stream_ready = (status == cudaSuccess);
+    }
+    // Give the operands defined contents instead of adding whatever bytes
+    // happened to be in the fresh allocations.
+    if (status == cudaSuccess) {
+        status = cudaMemset(a->data, 0, bytes);
+    }
+    if (status == cudaSuccess) {
+        status = cudaMemset(b->data, 0, bytes);
+    }
+    // Make sure the setup work has finished before the clock starts.
+    if (status == cudaSuccess) {
+        status = cudaDeviceSynchronize();
+    }
+
+    double seconds = -1.0;
+    if (status == cudaSuccess) {
+        const int thread = 1024;
+        const int block = (row + thread - 1) / thread; // ceil(row / thread)
+
+        clock_t start = clock();
+        for (int i = 0; i < n; i++) {
+            matrix_add<<<block, thread, 0, stream1>>>(a->data, b->data, row);
+        }
+        // Launch-configuration errors are only visible via cudaGetLastError;
+        // execution errors surface at the synchronisation point.
+        status = cudaGetLastError();
+        if (status == cudaSuccess) {
+            status = cudaStreamSynchronize(stream1);
+        }
+        if (status == cudaSuccess) {
+            seconds = (double)(clock() - start) / CLOCKS_PER_SEC;
+        }
+    }
+
+    if (stream_ready) {
+        cudaStreamDestroy(stream1);
+    }
+    release_matrix_d(a);
+    release_matrix_d(b);
+    return seconds;
+}
diff --git a/benchmark/matrix_add/versions/cpu.cu b/benchmark/matrix_add/versions/cpu.cu
@@ -0,0 +1,37 @@
+#include "../template.cuh"
+
+/*
+ * Allocates a host matrix of rows x cols floats. The element buffer is left
+ * uninitialised.
+ *
+ * Returns the new matrix, or NULL if a dimension is negative or an
+ * allocation fails. The caller owns both the struct and its data buffer.
+ */
+matrix* new_matrix(int rows, int cols) {
+    if (rows < 0 || cols < 0) {
+        return NULL;
+    }
+    matrix* res = (matrix*)malloc(sizeof(matrix));
+    if (res == NULL) {
+        return NULL;
+    }
+    res->rows = rows;
+    res->cols = cols;
+    // Multiply in size_t so large dimensions cannot overflow int arithmetic.
+    size_t count = (size_t)rows * (size_t)cols;
+    res->data = (float*)malloc(count * sizeof(float));
+    // malloc(0) may legitimately return NULL, so only a non-empty request
+    // that comes back NULL counts as a failure.
+    if (res->data == NULL && count != 0) {
+        free(res);
+        return NULL;
+    }
+    return res;
+}
+
+/*
+ * Allocates a matrix whose rows x cols float buffer is obtained with
+ * cudaMalloc; the matrix struct itself is allocated with malloc on the host.
+ * The buffer is left uninitialised.
+ *
+ * Returns the new matrix, or NULL if a dimension is negative or either
+ * allocation fails.
+ */
+matrix* new_matrix_d(int rows, int cols) {
+    if (rows < 0 || cols < 0) {
+        return NULL;
+    }
+    matrix* res = (matrix*)malloc(sizeof(matrix));
+    if (res == NULL) {
+        return NULL;
+    }
+    res->rows = rows;
+    res->cols = cols;
+    // Multiply in size_t so large dimensions cannot overflow int arithmetic.
+    size_t bytes = (size_t)rows * (size_t)cols * sizeof(float);
+    float* device_data = NULL;
+    if (cudaMalloc((void**)&device_data, bytes) != cudaSuccess) {
+        // Reset the recorded error so a later cudaGetLastError() check is
+        // not confused by this already-handled failure.
+        cudaGetLastError();
+        free(res);
+        return NULL;
+    }
+    res->data = device_data;
+    return res;
+}
+
+// Element-wise in-place addition on the host: adds the first `rows` elements
+// of b onto the corresponding elements of a. Does nothing when rows <= 0.
+void matrix_add(float* a, float* b, int rows) {
+    float* dst = a;
+    float* src = b;
+    // Count down the remaining elements while walking both buffers in step.
+    for (int remaining = rows; remaining > 0; --remaining) {
+        *dst = *dst + *src;
+        ++dst;
+        ++src;
+    }
+}
+
+/*
+ * Times n repetitions of the host matrix_add on two 100000 x 1 matrices.
+ *
+ * n: number of repetitions to time.
+ * Returns the elapsed time in seconds as measured by clock(), or -1.0 if the
+ * matrices could not be allocated.
+ */
+double time(int n) {
+    const int row = 100000;
+    matrix* a = new_matrix(row, 1);
+    matrix* b = new_matrix(row, 1);
+
+    double seconds = -1.0;
+    if (a != NULL && b != NULL && a->data != NULL && b->data != NULL) {
+        // Give the operands defined contents: the buffers come straight from
+        // malloc, and reading them uninitialised would feed arbitrary bit
+        // patterns into the measured loop.
+        for (int i = 0; i < row; i++) {
+            a->data[i] = 0.0f;
+            b->data[i] = 1.0f;
+        }
+
+        clock_t start = clock();
+        for (int i = 0; i < n; i++) {
+            matrix_add(a->data, b->data, row);
+        }
+        seconds = (double)(clock() - start) / CLOCKS_PER_SEC;
+
+        // Consume a result through a volatile so the optimiser cannot treat
+        // the additions as dead stores and remove the measured work.
+        volatile float sink = a->data[row - 1];
+        (void)sink;
+    }
+
+    if (a != NULL) {
+        free(a->data);
+        free(a);
+    }
+    if (b != NULL) {
+        free(b->data);
+        free(b);
+    }
+    return seconds;
+}