Skip to content

Commit

Permalink
Vectorizing sgemv over groups of 4 rows with SSE
Browse files Browse the repository at this point in the history
  • Loading branch information
jmvalin committed Nov 3, 2023
1 parent 166a6c8 commit 1ada7d4
Showing 1 changed file with 17 additions and 0 deletions.
17 changes: 17 additions & 0 deletions dnn/vec_avx.h
Original file line number Diff line number Diff line change
Expand Up @@ -709,6 +709,23 @@ static inline void sgemv(float *out, const float *weights, int rows, int cols, i
}
_mm256_storeu_ps (&y[0], vy0);
}
for (;i<rows-3;i+=4)
{
float *y;
__m128 vy0;
y = &out[i];
vy0 = _mm_setzero_ps();
for (j=0;j<cols;j++)
{
__m128 vxj;
__m128 vw;
vxj = _mm_broadcast_ss(&x[j]);

vw = _mm_loadu_ps(&weights[j*col_stride + i]);
vy0 = _mm_fmadd_ps(vw, vxj, vy0);
}
_mm_storeu_ps (&y[0], vy0);
}
for (;i<rows;i++)
{
out[i] = 0;
Expand Down

0 comments on commit 1ada7d4

Please sign in to comment.