Skip to content

Commit

Permalink
Remove unnecessary comment
Browse files Browse the repository at this point in the history
  • Loading branch information
nhatdongdang committed Jul 7, 2024
1 parent 9ed1a71 commit d92d08c
Show file tree
Hide file tree
Showing 10 changed files with 192 additions and 12 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ run_test: build
./speed_cpu ./weights_and_biases.txt ./tensors

test: build
./speed_cpu ./weights_and_biases.txt ./tensors 1
./speed_cpu ./weights_and_biases.txt ./tensors 1000000
mv ./results.csv ./test
python3 ./test/verify_csv.py

Expand Down
20 changes: 14 additions & 6 deletions benchmark/matrix_mul/benchmark.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,22 @@ typedef struct {
int cols;
} matrix;

matrix* new_matrix(int rows, int cols) {
matrix* res = (matrix*)malloc(sizeof(matrix));
res->rows = rows;
res->cols = cols;
res->data = (float*)malloc((rows * cols) * sizeof(float));
return res;
matrix* new_matrix_aligned(int rows, int cols) {
// Pad width to fit kernel
int kern_align_f32 = (rows * cols + KERN_COLS - 1) / KERN_COLS * KERN_COLS;

matrix* new_mat = (matrix*)malloc(sizeof(matrix));
new_mat->rows = rows;
new_mat->cols = cols;

// Align entire array for simd access and better cache line utilisation
new_mat->data =
(f32*)aligned_alloc(SIMD_ALIGN, (((kern_align_f32 * sizeof(f32)) + SIMD_ALIGN_F32 - 1) / SIMD_ALIGN_F32 * SIMD_ALIGN_F32));

return new_mat;
}


int main(int argc, char* argv[]) {
long n = 0;
if (argc > 1) {
Expand Down
Binary file added benchmark/matrix_mul/benchmark.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
76 changes: 76 additions & 0 deletions benchmark/matrix_mul/versions/2x8.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#include "../matrix_mul.h"

Check notice on line 1 in benchmark/matrix_mul/versions/2x8.c

View workflow job for this annotation

GitHub Actions / cpp-linter

Run clang-format on benchmark/matrix_mul/versions/2x8.c

File benchmark/matrix_mul/versions/2x8.c does not conform to Custom style guidelines. (lines 1, 19, 45, 49, 52, 55, 57, 58, 59, 61, 64, 65, 66, 67, 70, 71)
#include <x86intrin.h>
#include <string.h>

#define KERN_COLS 8
#define VEC_IN_KERN (KERN_COLS / 8)
#define KERN_ROWS 2
#define SIMD_ALGN 32
typedef float f32;
typedef unsigned char u8;

// Replaces in->data with a freshly allocated, zero-padded transpose.
//
// The new buffer has its row count padded up to a multiple of KERN_ROWS and
// its row width padded up to a multiple of KERN_COLS; all padding is zero.
// On return in->cols is the padded width (old row count rounded up to
// KERN_COLS) and in->rows is the old, unpadded column count. The previous
// data buffer is freed.
//
// Aborts the process if the new buffer cannot be allocated, because the
// function has no way to report failure to the caller.
void transpose_mat_inplace(matrix* in) {
    int cols_before = in->cols;
    int rows_before = in->rows;

    // Swapped for transpose. Computed in size_t so the product below cannot
    // overflow int.
    size_t pad_w_rows = ((size_t)cols_before + KERN_ROWS - 1) / KERN_ROWS * KERN_ROWS;
    size_t pad_w_width = ((size_t)rows_before + KERN_COLS - 1) / KERN_COLS * KERN_COLS;

    // aligned_alloc expects the size to be a multiple of the alignment.
    size_t bytes = (pad_w_rows * pad_w_width * sizeof(f32) + SIMD_ALGN - 1) / SIMD_ALGN * SIMD_ALGN;

    f32* transposed = aligned_alloc(SIMD_ALGN, bytes);
    if (transposed == NULL) {
        abort();
    }
    memset(transposed, 0, bytes);

    for (int row = 0; row < rows_before; row++) {
        for (int col = 0; col < cols_before; col++) {
            transposed[(size_t)col * pad_w_width + (size_t)row] =
                in->data[(size_t)row * (size_t)cols_before + (size_t)col];
        }
    }

    free(in->data);
    in->data = transposed;
    // Swap dims
    in->cols = (int)pad_w_width;
    in->rows = cols_before;
}

matrix* new_matrix_aligned(int rows, int cols) {
// Pad width to fit kernel
int kern_align_f32 = (rows * cols + KERN_COLS - 1) / KERN_COLS * KERN_COLS;

matrix* new_mat = (matrix*)malloc(sizeof(matrix));
new_mat->rows = rows;
new_mat->cols = cols;

// Align entire array for simd access and better cache line utilisation
new_mat->data =
(f32*)aligned_alloc(SIMD_ALGN, (((kern_align_f32 * sizeof(f32)) + 8 - 1) / 8 * 8));

return new_mat;
}


// Tiled multiply-accumulate kernel built on AVX loads and fused multiply-add.
//
// For every tile of KERN_ROWS rows and KERN_COLS columns it computes
//   results[c] += inputs[j] * weights[w_cols * j + c]
// for each row j and column c of the tile. The values already stored in
// results are loaded first, so the products are added on top of them.
//
// The loops always process whole tiles, so rows up to the next multiple of
// KERN_ROWS and columns up to the next multiple of KERN_COLS are touched.
// Aligned load/store intrinsics are used on both weights and results.
void matrix_mul(const f32* weights, const f32* inputs, f32* __restrict__ results, int w_cols, int w_rows) {
    for (int tile_row = 0; tile_row < w_rows; tile_row += KERN_ROWS) {
        for (int tile_col = 0; tile_col < w_cols; tile_col += KERN_COLS) {
            __m256 acc[VEC_IN_KERN];

            // Start from the partial sums currently held for this column tile.
            for (int v = 0; v < VEC_IN_KERN; v++) {
                acc[v] = _mm256_load_ps(results + tile_col + 8 * v);
            }

            // Fold in one weight row at a time, scaled by its input element.
            for (int r = 0; r < KERN_ROWS; r++) {
                const int in_idx = tile_row + r;
                const f32* w_row = weights + (w_cols * in_idx + tile_col);
                const __m256 scale = _mm256_set1_ps(inputs[in_idx]);
                for (int v = 0; v < VEC_IN_KERN; v++) {
                    const __m256 w_vec = _mm256_load_ps(w_row + 8 * v);
                    acc[v] = _mm256_fmadd_ps(scale, w_vec, acc[v]);
                }
            }

            // Write the updated sums back.
            for (int v = 0; v < VEC_IN_KERN; v++) {
                _mm256_store_ps(results + tile_col + 8 * v, acc[v]);
            }
        }
    }
}
77 changes: 77 additions & 0 deletions benchmark/matrix_mul/versions/4x8.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#include "../matrix_mul.h"

Check notice on line 1 in benchmark/matrix_mul/versions/4x8.c

View workflow job for this annotation

GitHub Actions / cpp-linter

Run clang-format on benchmark/matrix_mul/versions/4x8.c

File benchmark/matrix_mul/versions/4x8.c does not conform to Custom style guidelines. (lines 1, 20, 21, 46, 50, 53, 56, 58, 59, 60, 62, 65, 66, 67, 68, 71, 72)
#include <x86intrin.h>
#include <string.h>

#define KERN_COLS 8
#define KERN_ROWS 4
#define SIMD_ALIGN 32
#define SIMD_ALIGN_F32 (SIMD_ALIGN / 4) // f32 is 4 bytes

typedef float f32;
typedef unsigned char u8;

// Replaces in->data with a freshly allocated, zero-padded transpose.
//
// The new buffer has its row count padded up to a multiple of KERN_ROWS and
// its row width padded up to a multiple of KERN_COLS; all padding is zero.
// On return in->cols is the padded width (old row count rounded up to
// KERN_COLS) and in->rows is the old, unpadded column count. The previous
// data buffer is freed.
//
// Aborts the process if the new buffer cannot be allocated, because the
// function has no way to report failure to the caller.
void transpose_mat_inplace(matrix* in) {
    int cols_before = in->cols;
    int rows_before = in->rows;

    // Swapped for transpose. Computed in size_t so the product below cannot
    // overflow int.
    size_t pad_w_rows = ((size_t)cols_before + KERN_ROWS - 1) / KERN_ROWS * KERN_ROWS;
    size_t pad_w_width = ((size_t)rows_before + KERN_COLS - 1) / KERN_COLS * KERN_COLS;

    // aligned_alloc expects the size to be a multiple of the alignment, so
    // round the byte count up to SIMD_ALIGN bytes (not to a float count).
    size_t bytes = (pad_w_rows * pad_w_width * sizeof(f32) + SIMD_ALIGN - 1) / SIMD_ALIGN * SIMD_ALIGN;

    f32* transposed = aligned_alloc(SIMD_ALIGN, bytes);
    if (transposed == NULL) {
        abort();
    }
    memset(transposed, 0, bytes);

    for (int row = 0; row < rows_before; row++) {
        for (int col = 0; col < cols_before; col++) {
            transposed[(size_t)col * pad_w_width + (size_t)row] =
                in->data[(size_t)row * (size_t)cols_before + (size_t)col];
        }
    }

    free(in->data);
    in->data = transposed;
    // Swap dims
    in->cols = (int)pad_w_width;
    in->rows = cols_before;
}

matrix* new_matrix_aligned(int rows, int cols) {
// Pad width to fit kernel
int kern_align_f32 = (rows * cols + KERN_COLS - 1) / KERN_COLS * KERN_COLS;

matrix* new_mat = (matrix*)malloc(sizeof(matrix));
new_mat->rows = rows;
new_mat->cols = cols;

// Align entire array for simd access and better cache line utilisation
new_mat->data =
(f32*)aligned_alloc(SIMD_ALGN, (((kern_align_f32 * sizeof(f32)) + 8 - 1) / 8 * 8));

return new_mat;
}


// Tiled multiply-accumulate kernel built on AVX loads and fused multiply-add.
//
// For every tile of KERN_ROWS rows and KERN_COLS columns it computes
//   results[c] += inputs[j] * weights[w_cols * j + c]
// for each row j and column c of the tile. The values already stored in
// results are loaded first, so the products are added on top of them.
//
// The loops always process whole tiles, so rows up to the next multiple of
// KERN_ROWS and columns up to the next multiple of KERN_COLS are touched.
// Aligned load/store intrinsics are used on both weights and results.
void matrix_mul(const f32* weights, const f32* inputs, f32* __restrict__ results, int w_cols, int w_rows) {
    // Number of 8-float AVX vectors that span one kernel tile. Derived from
    // KERN_COLS here because this file does not define VEC_IN_KERN.
    enum { VECS_PER_KERN = KERN_COLS / 8 };

    for (int row = 0; row < w_rows; row += KERN_ROWS) {
        for (int col = 0; col < w_cols; col += KERN_COLS) {
            int col_bound = col + KERN_COLS;
            __m256 res[VECS_PER_KERN];

            // Start from the partial sums currently held for this column tile.
            for (int z = col; z < col_bound; z += 8) {
                int k = (z - col) / 8;
                res[k] = _mm256_load_ps(&results[z]);
            }

            // Fold in one weight row at a time, scaled by its input element.
            int row_bound = row + KERN_ROWS;
            for (int j = row; j < row_bound; j++) {
                __m256 is = _mm256_set1_ps(inputs[j]);
                for (int z = col; z < col_bound; z += 8) {
                    __m256 wr = _mm256_load_ps(&weights[w_cols * j + z]);
                    int k = (z - col) / 8;
                    res[k] = _mm256_fmadd_ps(is, wr, res[k]);
                }
            }

            // Write the updated sums back.
            for (int z = col; z < col_bound; z += 8) {
                int k = (z - col) / 8;
                _mm256_store_ps(&results[z], res[k]);
            }
        }
    }
}
Binary file added benchmark/softmax/benchmark.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
15 changes: 15 additions & 0 deletions benchmark/softmax/versions/softmax_v2.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#include "../softmax.h"

Check notice on line 1 in benchmark/softmax/versions/softmax_v2.c

View workflow job for this annotation

GitHub Actions / cpp-linter

Run clang-format on benchmark/softmax/versions/softmax_v2.c

File benchmark/softmax/versions/softmax_v2.c does not conform to Custom style guidelines. (lines 2, 11, 13)
#include "math.h"


void softmax(float* dest, int len) {
float res = 0.0f;
for (int i = 0; i < len; i++) {
res += expf(dest[i]);
}

float t= logf(res);
for (int i = 0; i < len; i++) {
dest[i] = expf(dest[i]-t);
}
}
10 changes: 7 additions & 3 deletions src/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ void propagate_fwd(const matrix* weights, const vector* inputs, vector* results,

// Basic version, too many aligned_alloc
u8 infer(vector* input) {
f32 temp1[104];
f32 temp2[72];
memset(temp1,0,sizeof(f32)*104);
memset(temp2,0,sizeof(f32)*72);
vector* outputs[NUM_LAYERS];
outputs[0] = new_vec_aligned(98);
outputs[1] = new_vec_aligned(65);
Expand Down Expand Up @@ -129,7 +133,7 @@ int main(int argc, char* argv[]) {
printf("Not enough arguments. Usage: speed_cpu <path_to_model.txt> <tensors_dir/> <number_of_inferences>\n");
return EXIT_FAILURE;
}

// Start timing
struct timeval stop, start, preinf;
gettimeofday(&start, NULL);
Expand Down Expand Up @@ -195,7 +199,7 @@ int main(int argc, char* argv[]) {
// int NUM_THREADS = sysconf(_SC_NPROCESSORS_ONLN);

if (iter_per_in > 1)
#pragma omp parallel
// #pragma omp parallel
{
int force = 0;
u8* results_local = (u8*)malloc(input_count * sizeof(u8));
Expand All @@ -206,7 +210,7 @@ int main(int argc, char* argv[]) {
vector* input = new_vec_aligned(TENSOR_SIZE);
memcpy(input->data, (f32*)&tensors[TSIZE_ALIGN_BYTES / sizeof(f32) * i], TENSOR_SIZE * sizeof(f32));

#pragma omp for
// #pragma omp for
for (int j = 0; j < iter_per_in - 1; j++) {
// Using global memory for model seems to be faster
results_local[i] = infer_reuse_layers_thread(input, weights, biases);
Expand Down
2 changes: 1 addition & 1 deletion src/matrix.c
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ static void kernel(const float* in, const float* wg, float* rs, int start_row, i
// Ver. Artemis Rosman
// W rows and W width is expected to be for the column major matrix, i.e. len of
// in vec = w_rows, len of out vec = w_cols
void sgemv_t_tuned(const float* weights, const float* inputs, float* __restrict__ results, int w_width, int w_rows) {
void sgemv_t_tuned(const float* __restrict__ weights, const float* __restrict__ inputs, float* __restrict__ results, int w_width, int w_rows) {
// Perform mult using kernel
for (int row = 0; row < w_rows; row += KERN_ROWS) {
for (int col = 0; col < w_width; col += KERN_COLS) {
Expand Down
2 changes: 1 addition & 1 deletion src/matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ typedef signed long i64;
#define KERN_COLS 8
#define KERN_ROWS 4
#define SIMD_ALIGN 32
#define SIMD_ALIGN_F32 (SIMD_ALIGN / sizeof(f32)) // f32 is 4 bytes
#define SIMD_ALIGN_F32 (SIMD_ALIGN / sizeof(f32))

typedef struct vector {
int len;
Expand Down

0 comments on commit d92d08c

Please sign in to comment.