Skip to content

Commit

Permalink
Merge pull request #26 from nhatdongdang/feat/gpu-multithread
Browse files Browse the repository at this point in the history
Gpu multithread
  • Loading branch information
nhatdongdang authored Jul 5, 2024
2 parents a4b925b + e8db37a commit 2e2c1e5
Show file tree
Hide file tree
Showing 10 changed files with 321 additions and 103 deletions.
19 changes: 19 additions & 0 deletions benchmark/matrix_add/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Compiler command shared by every benchmark build: nvcc with -O3, targeting
# sm_75, with fast-math enabled.
compile = nvcc -O3 -arch=sm_75 --use_fast_math
# Every versions/*.cu file is one benchmark variant; each maps to bin/<name>.
SRC_DIR := versions
BIN_DIR := bin
SRC_FILES := $(wildcard $(SRC_DIR)/*.cu)
EXECUTABLES := $(patsubst $(SRC_DIR)/%.cu, $(BIN_DIR)/%, $(SRC_FILES))

# Default target: wipe and recreate bin/, then build every variant.
all: clean $(EXECUTABLES)

# Removes the bin directory and recreates it empty.
clean:
rm -f -r bin
mkdir bin

# Builds one variant together with the shared driver benchmark.cu.
# NOTE: the output file is written as "<target>.exe", so the file named by the
# target itself is never created.
$(BIN_DIR)/%: $(SRC_DIR)/%.cu
$(compile) $< benchmark.cu -o $@.exe

# Rebuilds everything, then runs the plotting script.
plot: all
python3 ./plot.py


13 changes: 13 additions & 0 deletions benchmark/matrix_add/benchmark.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#include "template.cuh"
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main(int argc, char* argv[]) {
long n;
if (argc > 1) {
n = atol(argv[1]);
} else {
n = 100000;
}
printf("%f", time(n));
}
Binary file added benchmark/matrix_add/benchmark_plot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
50 changes: 50 additions & 0 deletions benchmark/matrix_add/plot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import os
import subprocess
import matplotlib.pyplot as plt

# Build all benchmark executables before measuring anything.
build = subprocess.run(['make'], capture_output=True, text=True)
if build.returncode != 0:
    # A failed build leaves ./bin missing, empty or incomplete, so any plot
    # produced from it would be misleading.
    raise SystemExit('make failed (exit code {}):\n{}'.format(build.returncode, build.stderr))

# Define the folder containing the executables
folder_path = './bin'  # Change this to your bin folder path

# Define the input sizes to test
start = 10000
end = 10000
step = 100000

input_sizes = list(range(start, end + 1, step))

# Sort the executables so the legend order is stable between runs
# (os.listdir returns entries in arbitrary order).
executables = sorted(
    exe for exe in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, exe))
)

# Runtimes collected for each executable, in the same order as input_sizes
runtimes = {exe: [] for exe in executables}

for exe in executables:
    exe_path = os.path.join(folder_path, exe)

    for n in input_sizes:
        # Run the executable with the input size and capture its output
        result = subprocess.run([exe_path, str(n)], capture_output=True, text=True)
        if result.returncode != 0:
            raise SystemExit('{} {} exited with code {}:\n{}'.format(
                exe_path, n, result.returncode, result.stderr))

        # The executable prints a single floating-point number of seconds
        try:
            runtime = float(result.stdout.strip())
        except ValueError:
            raise SystemExit('{} {} printed an unparsable runtime: {!r}'.format(
                exe_path, n, result.stdout))
        print(exe, runtime)

        runtimes[exe].append(runtime)

# Plot one line per executable
plt.figure(figsize=(12, 6))

for exe, times in runtimes.items():
    plt.plot(input_sizes, times, marker='o', label=exe)

plt.xlabel('Iterations')
plt.ylabel('Runtime (s)')
plt.title('Benchmark of Function Versions')
plt.legend()
plt.grid(True)
plt.tight_layout()

output_file = 'benchmark_plot.png'  # Specify your desired output file name and format
plt.savefig(output_file)
10 changes: 10 additions & 0 deletions benchmark/matrix_add/template.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#pragma once

// Matrix descriptor shared by every benchmark version.
typedef struct {
int rows;    // number of rows
int cols;    // number of columns
float* data; // element buffer, allocated for rows * cols floats by the constructors
} matrix;

// Runs the version-specific benchmark loop n times and returns the elapsed
// time in seconds. Each file under versions/ supplies its own definition.
double time(int n);
// Creates a matrix whose element buffer is allocated with cudaMalloc.
matrix* new_matrix_d(int rows, int cols);
44 changes: 44 additions & 0 deletions benchmark/matrix_add/versions/1.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#include "../template.cuh"

/*
 * Allocates a rows x cols matrix whose element buffer lives in host memory.
 *
 * The buffer comes from malloc and is left uninitialized.
 *
 * Returns the new matrix, or NULL when a dimension is negative, the byte size
 * would overflow size_t, or an allocation fails.
 */
matrix* new_matrix(int rows, int cols) {
    if (rows < 0 || cols < 0) {
        return NULL;
    }
    // Guard the size computation; it is done in size_t rather than int so
    // large dimensions cannot overflow.
    if (cols != 0 && (size_t)rows > ((size_t)-1 / sizeof(float)) / (size_t)cols) {
        return NULL;
    }
    const size_t count = (size_t)rows * (size_t)cols;

    matrix* res = (matrix*)malloc(sizeof(matrix));
    if (res == NULL) {
        return NULL;
    }
    res->rows = rows;
    res->cols = cols;
    res->data = (float*)malloc(count * sizeof(float));
    // malloc(0) may legitimately return NULL, so only a failed non-empty
    // allocation is treated as an error.
    if (res->data == NULL && count != 0) {
        free(res);
        return NULL;
    }
    return res;
}

/*
 * Allocates a rows x cols matrix whose element buffer is obtained with
 * cudaMalloc. The descriptor struct itself is allocated on the host with
 * malloc. The buffer is left uninitialized.
 *
 * Returns the new matrix, or NULL when a dimension is negative, the byte size
 * would overflow size_t, or either allocation fails.
 */
matrix* new_matrix_d(int rows, int cols) {
    if (rows < 0 || cols < 0) {
        return NULL;
    }
    // Guard the size computation; it is done in size_t rather than int so
    // large dimensions cannot overflow.
    if (cols != 0 && (size_t)rows > ((size_t)-1 / sizeof(float)) / (size_t)cols) {
        return NULL;
    }
    const size_t bytes = (size_t)rows * (size_t)cols * sizeof(float);

    matrix* res = (matrix*)malloc(sizeof(matrix));
    if (res == NULL) {
        return NULL;
    }
    res->rows = rows;
    res->cols = cols;
    res->data = NULL;
    if (cudaMalloc((void**)&(res->data), bytes) != cudaSuccess) {
        free(res);
        return NULL;
    }
    return res;
}

// In-place element-wise addition: a[i] += b[i].
// Each thread handles the one element at its flat 1D global index
// (blockIdx.x * blockDim.x + threadIdx.x); threads whose index is not below
// `rows` do nothing.
__global__ void matrix_add(float *a, float*b ,int rows)
{
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= rows) {
        return;
    }
    a[gid] = a[gid] + b[gid];
}

/*
 * GPU benchmark: performs n in-place additions of two 100000-element device
 * vectors on a dedicated stream and returns the elapsed time.
 *
 * n: number of kernel launches to time.
 *
 * Returns the processor time in seconds reported by clock() between the first
 * launch and completion of the last kernel, or -1.0 when allocation, stream
 * creation, a launch, or the final synchronization fails.
 *
 * The device buffers are not initialized; their contents do not influence the
 * amount of work performed.
 */
double time(int n) {
    const int row = 100000;
    const int thread = 1024;
    // Ceil-divide so the final partial block still covers the tail elements.
    const int block = (row + thread - 1) / thread;

    double seconds = -1.0;
    cudaStream_t stream1 = 0;
    bool stream_ready = false;

    matrix* a = new_matrix_d(row, 1);
    matrix* b = new_matrix_d(row, 1);

    if (a != NULL && b != NULL && cudaStreamCreate(&stream1) == cudaSuccess) {
        stream_ready = true;

        clock_t start = clock();
        for (int i = 0; i < n; i++) {
            // Launch the full grid so every one of the `row` elements is added.
            matrix_add<<<block, thread, 0, stream1>>>(a->data, b->data, row);
        }
        // Kernel launches are asynchronous: wait for the stream to drain
        // before stopping the clock, otherwise only enqueue time is measured.
        if (cudaGetLastError() == cudaSuccess &&
            cudaStreamSynchronize(stream1) == cudaSuccess) {
            seconds = (double)(clock() - start) / CLOCKS_PER_SEC;
        }
    }

    // Release everything acquired above, on success and failure alike.
    if (stream_ready) {
        cudaStreamDestroy(stream1);
    }
    if (a != NULL) {
        cudaFree(a->data);
        free(a);
    }
    if (b != NULL) {
        cudaFree(b->data);
        free(b);
    }
    return seconds;
}
37 changes: 37 additions & 0 deletions benchmark/matrix_add/versions/cpu.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#include "../template.cuh"

/*
 * Allocates a rows x cols matrix whose element buffer lives in host memory.
 *
 * The buffer comes from malloc and is left uninitialized.
 *
 * Returns the new matrix, or NULL when a dimension is negative, the byte size
 * would overflow size_t, or an allocation fails.
 */
matrix* new_matrix(int rows, int cols) {
    if (rows < 0 || cols < 0) {
        return NULL;
    }
    // Guard the size computation; it is done in size_t rather than int so
    // large dimensions cannot overflow.
    if (cols != 0 && (size_t)rows > ((size_t)-1 / sizeof(float)) / (size_t)cols) {
        return NULL;
    }
    const size_t count = (size_t)rows * (size_t)cols;

    matrix* res = (matrix*)malloc(sizeof(matrix));
    if (res == NULL) {
        return NULL;
    }
    res->rows = rows;
    res->cols = cols;
    res->data = (float*)malloc(count * sizeof(float));
    // malloc(0) may legitimately return NULL, so only a failed non-empty
    // allocation is treated as an error.
    if (res->data == NULL && count != 0) {
        free(res);
        return NULL;
    }
    return res;
}

/*
 * Allocates a rows x cols matrix whose element buffer is obtained with
 * cudaMalloc. The descriptor struct itself is allocated on the host with
 * malloc. The buffer is left uninitialized.
 *
 * Returns the new matrix, or NULL when a dimension is negative, the byte size
 * would overflow size_t, or either allocation fails.
 */
matrix* new_matrix_d(int rows, int cols) {
    if (rows < 0 || cols < 0) {
        return NULL;
    }
    // Guard the size computation; it is done in size_t rather than int so
    // large dimensions cannot overflow.
    if (cols != 0 && (size_t)rows > ((size_t)-1 / sizeof(float)) / (size_t)cols) {
        return NULL;
    }
    const size_t bytes = (size_t)rows * (size_t)cols * sizeof(float);

    matrix* res = (matrix*)malloc(sizeof(matrix));
    if (res == NULL) {
        return NULL;
    }
    res->rows = rows;
    res->cols = cols;
    res->data = NULL;
    if (cudaMalloc((void**)&(res->data), bytes) != cudaSuccess) {
        free(res);
        return NULL;
    }
    return res;
}

// In-place element-wise addition on the host: adds b[i] into a[i] for the
// first `rows` elements, front to back. Does nothing when rows <= 0.
void matrix_add(float* a, float* b, int rows) {
    float* dst = a;
    const float* src = b;
    for (int remaining = rows; remaining > 0; --remaining) {
        *dst = *dst + *src;
        ++dst;
        ++src;
    }
}

/*
 * CPU benchmark: performs n in-place additions of two 100000-element host
 * vectors and returns the elapsed time.
 *
 * n: number of times matrix_add is called inside the timed region.
 *
 * Returns the processor time in seconds reported by clock() for the timed
 * loop, or -1.0 when a matrix could not be allocated.
 */
double time(int n) {
    const int row = 100000;
    double seconds = -1.0;

    matrix* a = new_matrix(row, 1);
    matrix* b = new_matrix(row, 1);

    if (a != NULL && b != NULL && a->data != NULL && b->data != NULL) {
        // malloc leaves the buffers indeterminate; give them defined values so
        // the timed loop does not read uninitialized memory. This happens
        // outside the timed region.
        for (int i = 0; i < row; i++) {
            a->data[i] = 0.0f;
            b->data[i] = 1.0f;
        }

        clock_t start = clock();
        for (int i = 0; i < n; i++) {
            matrix_add(a->data, b->data, row);
        }
        seconds = (double)(clock() - start) / CLOCKS_PER_SEC;
    }

    // Release both matrices, on success and failure alike.
    if (a != NULL) {
        free(a->data);
        free(a);
    }
    if (b != NULL) {
        free(b->data);
        free(b);
    }
    return seconds;
}
Loading

0 comments on commit 2e2c1e5

Please sign in to comment.