Merge pull request #30 from nhatdongdang/feat/optimize-param
GPU optimize and fix precision loss
rozukke authored Jul 6, 2024
2 parents 9759b85 + dd98f99 commit 02dc834
Showing 4 changed files with 104 additions and 62 deletions.
6 changes: 3 additions & 3 deletions CMakeLists.txt
@@ -5,12 +5,12 @@ project(ichida-algo LANGUAGES C CXX CUDA)

# Set compiler flags
set(CMAKE_C_FLAGS "-O3 -march=native -ffast-math -funroll-loops -Wall -Wextra")
set(CMAKE_C_STANDARD 99)
set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED True)
set(CMAKE_VERBOSE_MAKEFILE ON)

# Ensure CUDA NVCC flags are set properly
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3 -arch=sm_75")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xptxas -O3 --use_fast_math -Xcompiler -march=native -unroll-aggressive -arch=sm_80")

set(INC_DIR include)
set(SRC_DIR src)
@@ -23,4 +23,4 @@ file(GLOB_RECURSE CUDA_SOURCE_FILES ${SRC_DIR}/*.cu)
# Create GPU executable
add_executable(speed_gpu ${CUDA_SOURCE_FILES})
set_target_properties(speed_gpu PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(speed_gpu m)
target_link_libraries(speed_gpu m)
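
A note on the flag change above: -arch=sm_80 targets Ampere-class GPUs, so a binary built from this commit will not load on the sm_75 (Turing) devices the old flag covered, and --use_fast_math trades IEEE accuracy for speed. A minimal host-side guard like the following sketch (an illustration, not part of this commit; the 8.0 minimum is an assumption mirroring -arch=sm_80) can confirm the active device meets the new requirement:

    #include <cstdio>
    #include <cuda_runtime.h>

    // Sketch: verify the active GPU is at least compute capability 8.0.
    // Not repo code; the threshold mirrors -arch=sm_80.
    int main() {
        int dev = 0;
        cudaDeviceProp prop;
        if (cudaGetDevice(&dev) != cudaSuccess ||
            cudaGetDeviceProperties(&prop, dev) != cudaSuccess) {
            fprintf(stderr, "no usable CUDA device\n");
            return 1;
        }
        if (prop.major < 8) {
            fprintf(stderr, "GPU is sm_%d%d; rebuild with a lower -arch\n",
                    prop.major, prop.minor);
            return 1;
        }
        printf("sm_80 binary should load on this device\n");
        return 0;
    }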
2 changes: 1 addition & 1 deletion Makefile
@@ -17,6 +17,6 @@ run: build
./speed_gpu ./weights_and_biases.txt ./tensors 100000

test: build
./speed_gpu ./weights_and_biases.txt ./tensors 100000
./speed_gpu ./weights_and_biases.txt ./tensors 1000000
mv ./results.csv ./test
python3 ./test/verify_csv.py
143 changes: 93 additions & 50 deletions src/main.cu
@@ -23,8 +23,9 @@ matrix* biases[NUM_LAYERS];
// device weights and biases;
matrix** d_weights;
matrix** d_biases;
matrix** d_inputs;

float* inputs;
float* d_inputs;
int* results;
int* d_results;

@@ -83,7 +84,7 @@ void read_model(const char* file_name) {
fclose(file);
}

void read_tensor(matrix* a, const char* fileName) {
void read_tensor(float* a, const char* fileName) {
FILE* file = fopen(fileName, "r");
char* line = NULL;
size_t len = 0;
@@ -96,64 +97,64 @@ void read_tensor(matrix* a, const char* fileName) {

for (int i = 0; i < 225; i++) {
value = strtof(token, NULL);
(a->data)[i] = value;
a[i] = value;
token = strtok(NULL, delimiter);
}
free(line);
fclose(file);
}

__device__ void propagate_fwd(matrix* weights, matrix* input_layer, matrix* output_layer, matrix* biases) {
matrix_mul(weights->data, input_layer->data, output_layer->data, weights->rows, weights->cols);
matrix_add(output_layer->data, biases->data, biases->rows);
__device__ void propagate_fwd(matrix* weights, float* input_layer, float* output_layer, matrix* biases) {
matrix_mul(weights->data, input_layer, output_layer, weights->rows, weights->cols);
matrix_add(output_layer, biases->data, biases->rows);
}

__global__ void infer(matrix** d_inputs, int* d_results, matrix** d_weights, matrix** d_biases, int it_per_input,
#define BLOCKS 108
#define THREADS_PER_BLOCK 1024

__global__ void infer(float* d_inputs, int* d_results, matrix** d_weights, matrix** d_biases, int it_per_input,
int in_num) {

__shared__ float sharedInput[225];
float out1[98];
float out2[65];

int num_threads = blockDim.x * gridDim.x;
int thread_idx = (blockIdx.x * blockDim.x + threadIdx.x);

if (thread_idx > it_per_input) return;
float* input = (float*)&d_inputs[in_num * 225];

matrix* input = d_inputs[in_num];

matrix* outputs[2];
outputs[0] = new_matrix(98, 1);
outputs[1] = new_matrix(65, 1);
if (threadIdx.x < 225) {
sharedInput[threadIdx.x] = input[threadIdx.x];
}
__syncthreads();

for (int i = thread_idx; i < it_per_input; i += num_threads) {
propagate_fwd(d_weights[0], input, outputs[0], d_biases[0]);
relu(outputs[0]->data, 98);
propagate_fwd(d_weights[0], sharedInput, out1, d_biases[0]);
relu(out1, 98);

propagate_fwd(d_weights[1], outputs[0], outputs[1], d_biases[1]);
relu(outputs[1]->data, 65);
propagate_fwd(d_weights[1], out1, out2, d_biases[1]);
relu(out2, 65);

propagate_fwd(d_weights[2], outputs[1], outputs[0], d_biases[2]);
relu(outputs[0]->data, 50);
propagate_fwd(d_weights[2], out2, out1, d_biases[2]);
relu(out1, 50);

propagate_fwd(d_weights[3], outputs[0], outputs[1], d_biases[3]);
relu(outputs[1]->data, 30);
propagate_fwd(d_weights[3], out1, out2, d_biases[3]);
relu(out2, 30);

propagate_fwd(d_weights[4], outputs[1], outputs[0], d_biases[4]);
relu(outputs[0]->data, 25);
propagate_fwd(d_weights[4], out2, out1, d_biases[4]);
relu(out1, 25);

propagate_fwd(d_weights[5], outputs[0], outputs[1], d_biases[5]);
relu(outputs[1]->data, 40);
propagate_fwd(d_weights[5], out1, out2, d_biases[5]);
relu(out2, 40);

propagate_fwd(d_weights[6], outputs[1], outputs[0], d_biases[6]);
softmax(outputs[0]->data, 52);
propagate_fwd(d_weights[6], out2, out1, d_biases[6]);
softmax(out1, 52);

int res = argmax(outputs[0]->data, 52);
d_results[in_num] = res;
d_results[in_num] = argmax(out1, 52);
}
free(outputs[0]->data);
free(outputs[0]);
free(outputs[1]->data);
free(outputs[1]);
}

#define IT_PER_IN 1000000

int main(int argc, char* argv[]) {
if (argc < 4) {
printf("Not enough arguments. Usage: speed_cpu <path_to_model.txt> <tensors_dir/> <number_of_inferences>\n");
@@ -186,8 +187,7 @@ int main(int argc, char* argv[]) {
for (int i = 0; i < NUM_LAYERS; i++) {
matrix* a = copy_to_device(weights[i]);
matrix* b = copy_to_device(biases[i]);
matrix** z = &(d_weights[i]);
CUDA_CHECK(cudaMemcpy(z, &a, sizeof(matrix*), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(&(d_weights[i]), &a, sizeof(matrix*), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(&(d_biases[i]), &b, sizeof(matrix*), cudaMemcpyHostToDevice));
}

@@ -208,39 +208,82 @@ int main(int argc, char* argv[]) {
}

results = (int*)malloc((input_count) * sizeof(int));
memset(results, 0, sizeof(int) * (input_count));
inputs = (float*)malloc((input_count) * sizeof(float) * 225);

cudaMalloc(&d_results, (input_count) * sizeof(int));
cudaMalloc(&d_inputs, (input_count) * sizeof(matrix*));
cudaMalloc(&d_inputs, (input_count) * sizeof(float) * 225);

dir = opendir(directory_path);
while ((entry = readdir(dir)) != NULL) {
if (entry->d_type == DT_REG) {
matrix* input = new_matrix(225, 1);
strcpy(file_num_str, entry->d_name);
file_num_str[strlen(entry->d_name) - 7] = '\0';
file_num = atoi(entry->d_name);
strcpy(file_name, directory_path);
strcat(file_name, "/");
strcat(file_name, entry->d_name);
read_tensor(input, file_name);
matrix* temp = copy_to_device(input);
cudaMemcpy(&d_inputs[file_num - 1], &temp, sizeof(matrix*), cudaMemcpyHostToDevice);
free(input);
read_tensor((float*)&inputs[(file_num - 1) * 225], file_name);
}
}

free(file_name);
free(file_num_str);
closedir(dir);

cudaMemset(d_results, 0, sizeof(int) * input_count);
cudaMemcpy(d_inputs, inputs, sizeof(float) * 225 * input_count, cudaMemcpyHostToDevice);

int deviceCount;
cudaError_t err = cudaGetDeviceCount(&deviceCount);
if (err != cudaSuccess) {
printf("Error: %s\n", cudaGetErrorString(err));
return -1;
}

for (int i = 0; i < deviceCount; ++i) {
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, i);
printf("Device %d:\n", i);
printf(" Device Name: %s\n", prop.name);
printf(" Compute Capability: %d.%d\n", prop.major, prop.minor);
printf(" Total Global Memory: %lu bytes\n", prop.totalGlobalMem);
printf(" Shared Memory per Block: %lu bytes\n", prop.sharedMemPerBlock);
printf(" Registers per Block: %d\n", prop.regsPerBlock);
printf(" Warp Size: %d\n", prop.warpSize);
printf(" Max Threads per Block: %d\n", prop.maxThreadsPerBlock);
printf(" Max threads per SM: %d\n", prop.maxThreadsPerMultiProcessor);
printf(" Max Threads Dim: (%d, %d, %d)\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1],
prop.maxThreadsDim[2]);
printf(" Max Grid Size: (%d, %d, %d)\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
printf(" Clock Rate: %d kHz\n", prop.clockRate);
printf(" Total Constant Memory: %lu bytes\n", prop.totalConstMem);
printf(" Multiprocessor Count: %d\n", prop.multiProcessorCount);
printf(" Memory Clock Rate: %d kHz\n", prop.memoryClockRate);
printf(" Memory Bus Width: %d bits\n", prop.memoryBusWidth);
printf(" L2 Cache Size: %d bytes\n", prop.l2CacheSize);
printf("\n");
}

int minGridSize, blockSize;
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, infer, 0, 0);
printf("Recommended block size: %d Grid size: %d\n", blockSize, minGridSize);

int it_num = atoi(argv[3]);
struct timeval stop1, start1;
gettimeofday(&start1, NULL);

int iter_per_in = atoi(argv[3]);
cudaDeviceSynchronize();
for (int i = 0; i < input_count; i++) {
infer<<<108, 69>>>(d_inputs, d_results, d_weights, d_biases, iter_per_in, i);
infer<<<BLOCKS, THREADS_PER_BLOCK>>>(d_inputs, d_results, d_weights, d_biases, it_num, i);
err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDA error: %s\n", cudaGetErrorString(err));
}
}

cudaDeviceSynchronize();

cudaMemcpy(results, d_results, (input_count) * (sizeof(int)), cudaMemcpyDeviceToHost);
gettimeofday(&stop1, NULL);
printf("- Inference: %lu us\n", (stop1.tv_sec - start1.tv_sec) * 1000000 + stop1.tv_usec - start1.tv_usec);

FILE* csv_file = fopen("results.csv", "w+");
fprintf(csv_file, "image_number, guess\n");
Expand All @@ -251,7 +294,7 @@ int main(int argc, char* argv[]) {

// Time taken
gettimeofday(&stop, NULL);
printf("took %lu us\n", (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec);
printf("- Total: %lu us\n", (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec);

return EXIT_SUCCESS;
}
}
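
Summary of the main.cu rewrite: inputs now live in one contiguous device buffer of 225 floats per image instead of an array of device-side matrix structs, each block stages the current image into shared memory once, and the per-layer scratch buffers are fixed-size local arrays rather than in-kernel heap allocations. The remaining changes add a device-properties dump, a cudaOccupancyMaxPotentialBlockSize query, and a single cudaMemcpy of all flattened inputs in place of per-file copies. A condensed sketch of the new kernel pattern follows; its work() is a hypothetical stand-in for the propagate_fwd/relu chain, and the trivial sum it computes is illustrative only:

    #include <cuda_runtime.h>

    #define INPUT_LEN 225

    // Hypothetical stand-in for the propagate_fwd/relu layer chain.
    __device__ float work(const float* in) {
        float s = 0.0f;
        for (int j = 0; j < INPUT_LEN; j++) s += in[j]; // placeholder compute
        return s;
    }

    __global__ void infer_sketch(const float* d_inputs, float* d_out,
                                 int it_per_input, int in_num) {
        __shared__ float sharedInput[INPUT_LEN];
        int num_threads = blockDim.x * gridDim.x;
        int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;

        // One flat buffer, 225 floats per image, replaces matrix** handles.
        const float* input = &d_inputs[in_num * INPUT_LEN];
        if (threadIdx.x < INPUT_LEN) sharedInput[threadIdx.x] = input[threadIdx.x];
        __syncthreads(); // every thread now reads the staged copy

        // Grid-stride loop: threads split the it_per_input repetitions.
        for (int i = thread_idx; i < it_per_input; i += num_threads)
            d_out[in_num] = work(sharedInput);
    }

Launched as infer_sketch<<<BLOCKS, THREADS_PER_BLOCK>>>(...), matching the commit's 108 x 1024 configuration. In the real kernel, sizing the two scratch arrays at 98 and 65 floats (the two largest layer widths) is what lets the forward pass ping-pong between out1 and out2 across all seven layers.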
15 changes: 7 additions & 8 deletions src/matrix.cu
@@ -26,7 +26,6 @@ matrix* new_matrix_d(int rows, int cols) {
float* data;
cudaMalloc(&data, rows * cols * sizeof(float));
alloc<<<1, 1>>>(res, data, rows, cols);
cudaDeviceSynchronize();
return res;
}

@@ -37,7 +36,6 @@ matrix* copy_to_device(matrix* h_mat) {
cudaMalloc(&data, h_mat->rows * h_mat->cols * sizeof(float));
cudaMemcpy(data, h_mat->data, h_mat->rows * h_mat->cols * sizeof(float), cudaMemcpyHostToDevice);
alloc<<<1, 1>>>(res, data, h_mat->rows, h_mat->cols);
cudaDeviceSynchronize();
return res;
}

@@ -89,17 +87,18 @@ static __device__ inline float fastexp(float x) {
}

__device__ void softmax(float* a, int rows) {
float res = (float)0;
for (int i = 0; i < rows; i++) {
res += exp(a[i]);
float sum = 0.0;
for (size_t i = 0; i < rows; i++) {
sum += __expf(a[i]);
}
for (int i = 0; i < rows; i++) {
a[i] /= res;
float t = __logf(sum);
for (size_t i = 0; i < rows; i++) {
a[i] = __expf(a[i] - t);
}
}

__device__ int argmax(float* a, int rows) {
int res = a[0];
float res = a[0];
int idx = 0;
for (int i = 0; i < rows; i++) {
if (res < a[i]) {
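
The softmax change is the "precision loss" fix of the commit title: instead of dividing each element by the accumulated sum (and calling double-precision exp to build it), the new version computes t = log(sum) once and re-expresses each quotient as exp(a[i] - t) using the fast single-precision intrinsics __expf/__logf, relying on the identity exp(a[i]) / sum_j exp(a[j]) = exp(a[i] - log sum_j exp(a[j])). A common fully stable variant also subtracts the running maximum first so the sum itself cannot overflow; the sketch below shows that form for comparison (an alternative, not what this commit ships):

    // Max-subtracted softmax sketch. NOT the committed version, which skips
    // the max pass; shown as the overflow-proof textbook alternative.
    __device__ void softmax_stable(float* a, int n) {
        float m = a[0];
        for (int i = 1; i < n; i++) m = fmaxf(m, a[i]); // running max
        float sum = 0.0f;
        for (int i = 0; i < n; i++) sum += __expf(a[i] - m); // each term <= 1
        float t = __logf(sum);
        for (int i = 0; i < n; i++) a[i] = __expf(a[i] - m - t);
    }

The adjacent argmax hunk fixes a related truncation bug: the running best value was previously held in an int, so every comparison ran against a truncated copy of a[0].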
