-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
9ed1a71
commit d92d08c
Showing
10 changed files
with
192 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
#include "../matrix_mul.h" | ||
Check notice on line 1 in benchmark/matrix_mul/versions/2x8.c — GitHub Actions / cpp-linter: Run clang-format on benchmark/matrix_mul/versions/2x8.c
|
||
#include <x86intrin.h> | ||
#include <string.h> | ||
|
||
// Kernel tile width in f32 columns: the column step of matrix_mul and the
// multiple that padded row widths are rounded up to.
#define KERN_COLS 8 | ||
// Number of 8-float __m256 vectors needed to cover one tile row.
#define VEC_IN_KERN (KERN_COLS / 8) | ||
// Kernel tile height: the row step of matrix_mul and the multiple that the
// transposed row count is rounded up to.
#define KERN_ROWS 2 | ||
// Byte alignment requested from aligned_alloc for matrix data.
#define SIMD_ALGN 32 | ||
// Short aliases for the element types used in this file.
typedef float f32; | ||
typedef unsigned char u8; | ||
|
||
// Replace in->data with a zero-padded transpose of itself.
//
// The source is read as a dense row-major rows x cols array (row stride ==
// in->cols). The new buffer holds the transpose with its row count rounded up
// to a multiple of KERN_ROWS and its row stride rounded up to a multiple of
// KERN_COLS; all padding elements are zero.
//
// On success the old buffer is released with free() (it is assumed to come
// from a free()-compatible allocator), in->cols becomes the padded stride and
// in->rows becomes the old, unpadded column count.
//
// If the new buffer cannot be allocated the matrix is left completely
// untouched; callers can detect this because in->data does not change.
void transpose_mat_inplace(matrix* in) {
  int cols_before = in->cols;
  int rows_before = in->rows;

  // Swapped for transpose
  int pad_w_rows = (cols_before + KERN_ROWS - 1) / KERN_ROWS * KERN_ROWS;
  int pad_w_width = (rows_before + KERN_COLS - 1) / KERN_COLS * KERN_COLS;

  // Size arithmetic is done in size_t so large matrices cannot overflow int.
  size_t n_bytes = (size_t)pad_w_rows * (size_t)pad_w_width * sizeof(f32);
  // aligned_alloc wants a size that is a multiple of the alignment.
  n_bytes = (n_bytes + SIMD_ALGN - 1) / SIMD_ALGN * SIMD_ALGN;
  if (n_bytes == 0) {
    // A zero-size request may legally return NULL; ask for one aligned unit
    // so that NULL below always means "allocation failed".
    n_bytes = SIMD_ALGN;
  }

  f32* transposed = aligned_alloc(SIMD_ALGN, n_bytes);
  if (transposed == NULL) {
    return;
  }
  memset(transposed, 0, n_bytes);

  for (int row = 0; row < rows_before; row++) {
    for (int col = 0; col < cols_before; col++) {
      transposed[(size_t)col * pad_w_width + row] =
          in->data[(size_t)row * cols_before + col];
    }
  }

  free(in->data);
  in->data = transposed;
  // Swap dims
  in->cols = pad_w_width;
  in->rows = cols_before;
}
|
||
matrix* new_matrix_aligned(int rows, int cols) { | ||
// Pad width to fit kernel | ||
int kern_align_f32 = (rows * cols + KERN_COLS - 1) / KERN_COLS * KERN_COLS; | ||
|
||
matrix* new_mat = (matrix*)malloc(sizeof(matrix)); | ||
new_mat->rows = rows; | ||
new_mat->cols = cols; | ||
|
||
// Align entire array for simd access and better cache line utilisation | ||
new_mat->data = | ||
(f32*)aligned_alloc(SIMD_ALGN, (((kern_align_f32 * sizeof(f32)) + 8 - 1) / 8 * 8)); | ||
|
||
return new_mat; | ||
} | ||
|
||
|
||
// Accumulate a vector-matrix product into results, one KERN_ROWS x KERN_COLS
// tile at a time:
//
//   results[c] += sum over r of inputs[r] * weights[r * w_cols + c]
//
// weights is read row-major with stride w_cols. Tiles are always processed in
// full, so rows up to the next multiple of KERN_ROWS and columns up to the
// next multiple of KERN_COLS are touched. The aligned AVX load/store
// intrinsics are used on results and weights, so every accessed 8-float group
// must start on a 32-byte boundary.
void matrix_mul(const f32* weights, const f32* inputs, f32* __restrict__ results, int w_cols, int w_rows){
  for (int tile_row = 0; tile_row < w_rows; tile_row += KERN_ROWS) {
    const int tile_row_end = tile_row + KERN_ROWS;

    for (int tile_col = 0; tile_col < w_cols; tile_col += KERN_COLS) {
      f32* out = results + tile_col;
      __m256 acc[VEC_IN_KERN];

      // Start from the partial sums already stored for these columns.
      for (int v = 0; v < VEC_IN_KERN; v++) {
        acc[v] = _mm256_load_ps(out + 8 * v);
      }

      // Each row adds inputs[r] times its slice of weights.
      for (int r = tile_row; r < tile_row_end; r++) {
        const __m256 scale = _mm256_set1_ps(inputs[r]);
        const f32* w_slice = weights + w_cols * r + tile_col;
        for (int v = 0; v < VEC_IN_KERN; v++) {
          acc[v] = _mm256_fmadd_ps(scale, _mm256_load_ps(w_slice + 8 * v), acc[v]);
        }
      }

      // Write the updated partial sums back.
      for (int v = 0; v < VEC_IN_KERN; v++) {
        _mm256_store_ps(out + 8 * v, acc[v]);
      }
    }
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
#include "../matrix_mul.h" | ||
Check notice on line 1 in benchmark/matrix_mul/versions/4x8.c — GitHub Actions / cpp-linter: Run clang-format on benchmark/matrix_mul/versions/4x8.c
|
||
#include <x86intrin.h> | ||
#include <string.h> | ||
|
||
// Kernel tile width in f32 columns: the column step of matrix_mul and the
// multiple that padded row widths are rounded up to.
#define KERN_COLS 8 | ||
// Kernel tile height: the row step of matrix_mul and the multiple that the
// transposed row count is rounded up to.
#define KERN_ROWS 4 | ||
// Byte alignment requested from aligned_alloc for matrix data.
#define SIMD_ALIGN 32 | ||
// SIMD_ALIGN expressed as a number of f32 elements instead of bytes.
#define SIMD_ALIGN_F32 (SIMD_ALIGN / 4) // f32 is 4 bytes | ||
|
||
// Short aliases for the element types used in this file.
typedef float f32; | ||
typedef unsigned char u8; | ||
|
||
// Replace in->data with a zero-padded transpose of itself.
//
// The source is read as a dense row-major rows x cols array (row stride ==
// in->cols). The new buffer holds the transpose with its row count rounded up
// to a multiple of KERN_ROWS and its row stride rounded up to a multiple of
// KERN_COLS; all padding elements are zero.
//
// On success the old buffer is released with free() (it is assumed to come
// from a free()-compatible allocator), in->cols becomes the padded stride and
// in->rows becomes the old, unpadded column count.
//
// If the new buffer cannot be allocated the matrix is left completely
// untouched; callers can detect this because in->data does not change.
void transpose_mat_inplace(matrix* in) {
  int cols_before = in->cols;
  int rows_before = in->rows;

  // Swapped for transpose
  int pad_w_rows = (cols_before + KERN_ROWS - 1) / KERN_ROWS * KERN_ROWS;
  int pad_w_width = (rows_before + KERN_COLS - 1) / KERN_COLS * KERN_COLS;

  // Size arithmetic is done in size_t so large matrices cannot overflow int.
  size_t n_bytes = (size_t)pad_w_rows * (size_t)pad_w_width * sizeof(f32);
  // Round the BYTE count up to the byte alignment (not to an element count):
  // aligned_alloc wants a size that is a multiple of the alignment.
  n_bytes = (n_bytes + SIMD_ALIGN - 1) / SIMD_ALIGN * SIMD_ALIGN;
  if (n_bytes == 0) {
    // A zero-size request may legally return NULL; ask for one aligned unit
    // so that NULL below always means "allocation failed".
    n_bytes = SIMD_ALIGN;
  }

  f32* transposed = aligned_alloc(SIMD_ALIGN, n_bytes);
  if (transposed == NULL) {
    return;
  }
  memset(transposed, 0, n_bytes);

  for (int row = 0; row < rows_before; row++) {
    for (int col = 0; col < cols_before; col++) {
      transposed[(size_t)col * pad_w_width + row] =
          in->data[(size_t)row * cols_before + col];
    }
  }

  free(in->data);
  in->data = transposed;
  // Swap dims
  in->cols = pad_w_width;
  in->rows = cols_before;
}
|
||
matrix* new_matrix_aligned(int rows, int cols) { | ||
// Pad width to fit kernel | ||
int kern_align_f32 = (rows * cols + KERN_COLS - 1) / KERN_COLS * KERN_COLS; | ||
|
||
matrix* new_mat = (matrix*)malloc(sizeof(matrix)); | ||
new_mat->rows = rows; | ||
new_mat->cols = cols; | ||
|
||
// Align entire array for simd access and better cache line utilisation | ||
new_mat->data = | ||
(f32*)aligned_alloc(SIMD_ALGN, (((kern_align_f32 * sizeof(f32)) + 8 - 1) / 8 * 8)); | ||
|
||
return new_mat; | ||
} | ||
|
||
|
||
// Accumulate a vector-matrix product into results, one KERN_ROWS x KERN_COLS
// tile at a time:
//
//   results[c] += sum over r of inputs[r] * weights[r * w_cols + c]
//
// weights is read row-major with stride w_cols. Tiles are always processed in
// full, so rows up to the next multiple of KERN_ROWS and columns up to the
// next multiple of KERN_COLS are touched. The aligned AVX load/store
// intrinsics are used on results and weights, so every accessed 8-float group
// must start on a 32-byte boundary.
void matrix_mul(const f32* weights, const f32* inputs, f32* __restrict__ results, int w_cols, int w_rows){
  // Number of 8-float __m256 vectors covering one tile row. Derived locally
  // from KERN_COLS so the kernel does not rely on a separate macro.
  enum { VECS_PER_TILE = KERN_COLS / 8 };

  for (int row = 0; row < w_rows; row += KERN_ROWS) {
    int row_bound = row + KERN_ROWS;

    for (int col = 0; col < w_cols; col += KERN_COLS) {
      __m256 res[VECS_PER_TILE];

      // Start from the partial sums already stored for these columns.
      for (int k = 0; k < VECS_PER_TILE; k++) {
        res[k] = _mm256_load_ps(&results[col + 8 * k]);
      }

      // Each row adds inputs[j] times its slice of weights.
      for (int j = row; j < row_bound; j++) {
        __m256 is = _mm256_set1_ps(inputs[j]);
        for (int k = 0; k < VECS_PER_TILE; k++) {
          __m256 wr = _mm256_load_ps(&weights[w_cols * j + col + 8 * k]);
          res[k] = _mm256_fmadd_ps(is, wr, res[k]);
        }
      }

      // Write the updated partial sums back.
      for (int k = 0; k < VECS_PER_TILE; k++) {
        _mm256_store_ps(&results[col + 8 * k], res[k]);
      }
    }
  }
}
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
#include "../softmax.h" | ||
#include "math.h" | ||
|
||
|
||
void softmax(float* dest, int len) { | ||
float res = 0.0f; | ||
for (int i = 0; i < len; i++) { | ||
res += expf(dest[i]); | ||
} | ||
|
||
float t= logf(res); | ||
for (int i = 0; i < len; i++) { | ||
dest[i] = expf(dest[i]-t); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters