diff --git a/cmake/gen/hvx_microkernels.cmake b/cmake/gen/hvx_microkernels.cmake
index 34b5aad873c..2de8ccdc47b 100644
--- a/cmake/gen/hvx_microkernels.cmake
+++ b/cmake/gen/hvx_microkernels.cmake
@@ -106,6 +106,13 @@ SET(NON_PROD_HVX_MICROKERNEL_SRCS
   src/qs8-vadd/gen/qs8-vadd-minmax-hvx-u32.c
   src/qs8-vadd/gen/qs8-vadd-minmax-hvx-u64.c
   src/qs8-vadd/gen/qs8-vadd-minmax-hvx-u96.c
-  src/qs8-vadd/gen/qs8-vadd-minmax-hvx-u128.c)
+  src/qs8-vadd/gen/qs8-vadd-minmax-hvx-u128.c
+  src/x32-transposec/gen/x32-transposec-2x32-multi-multi-hvx.c
+  src/x32-transposec/gen/x32-transposec-4x32-multi-multi-hvx.c
+  src/x32-transposec/gen/x32-transposec-8x32-multi-multi-hvx.c
+  src/x32-transposec/gen/x32-transposec-16x32-multi-multi-hvx.c
+  src/x32-transposec/gen/x32-transposec-32x32-multi-mov-hvx.c
+  src/x32-transposec/gen/x32-transposec-32x32-multi-multi-hvx.c
+  src/x32-transposec/gen/x32-transposec-32x32-multi-switch-hvx.c)
 
 SET(ALL_HVX_MICROKERNEL_SRCS ${PROD_HVX_MICROKERNEL_SRCS} + ${NON_PROD_HVX_MICROKERNEL_SRCS})
diff --git a/gen/hvx_microkernels.bzl b/gen/hvx_microkernels.bzl
index 691a12da673..c679eba085d 100644
--- a/gen/hvx_microkernels.bzl
+++ b/gen/hvx_microkernels.bzl
@@ -104,6 +104,13 @@ NON_PROD_HVX_MICROKERNEL_SRCS = [
     "src/qs8-vadd/gen/qs8-vadd-minmax-hvx-u64.c",
     "src/qs8-vadd/gen/qs8-vadd-minmax-hvx-u96.c",
     "src/qs8-vadd/gen/qs8-vadd-minmax-hvx-u128.c",
+    "src/x32-transposec/gen/x32-transposec-2x32-multi-multi-hvx.c",
+    "src/x32-transposec/gen/x32-transposec-4x32-multi-multi-hvx.c",
+    "src/x32-transposec/gen/x32-transposec-8x32-multi-multi-hvx.c",
+    "src/x32-transposec/gen/x32-transposec-16x32-multi-multi-hvx.c",
+    "src/x32-transposec/gen/x32-transposec-32x32-multi-mov-hvx.c",
+    "src/x32-transposec/gen/x32-transposec-32x32-multi-multi-hvx.c",
+    "src/x32-transposec/gen/x32-transposec-32x32-multi-switch-hvx.c",
 ]
 
 ALL_HVX_MICROKERNEL_SRCS = PROD_HVX_MICROKERNEL_SRCS + NON_PROD_HVX_MICROKERNEL_SRCS
diff --git a/scripts/generate-xN-transpose.sh b/scripts/generate-xN-transpose.sh
index fdb6bafc3ca..0ca05499a1f 100755
--- a/scripts/generate-xN-transpose.sh
+++ b/scripts/generate-xN-transpose.sh
@@ -171,4 +171,13 @@ tools/xngen src/x32-transposec/wasmsimd.c.in -D SIZE=32 IN_PTRS=MULTI OUT_PTRS=S
 tools/xngen src/x32-transposec/wasmsimd.c.in -D SIZE=32 IN_PTRS=MULTI OUT_PTRS=MULTI -o src/x32-transposec/gen/x32-transposec-4x4-multi-multi-wasmsimd.c &
 tools/xngen src/x32-transposec/wasmsimd.c.in -D SIZE=32 IN_PTRS=MULTI OUT_PTRS=MOV -o src/x32-transposec/gen/x32-transposec-4x4-multi-mov-wasmsimd.c &
 
+#################################### HEXAGON HVX ###############################
+tools/xngen src/x32-transposec/hvx.c.in -D SIZE=32 IN_PTRS=MULTI OUT_PTRS=MULTI TILE_HEIGHT=2 -o src/x32-transposec/gen/x32-transposec-2x32-multi-multi-hvx.c &
+tools/xngen src/x32-transposec/hvx.c.in -D SIZE=32 IN_PTRS=MULTI OUT_PTRS=MULTI TILE_HEIGHT=4 -o src/x32-transposec/gen/x32-transposec-4x32-multi-multi-hvx.c &
+tools/xngen src/x32-transposec/hvx.c.in -D SIZE=32 IN_PTRS=MULTI OUT_PTRS=MULTI TILE_HEIGHT=8 -o src/x32-transposec/gen/x32-transposec-8x32-multi-multi-hvx.c &
+tools/xngen src/x32-transposec/hvx.c.in -D SIZE=32 IN_PTRS=MULTI OUT_PTRS=MULTI TILE_HEIGHT=16 -o src/x32-transposec/gen/x32-transposec-16x32-multi-multi-hvx.c &
+tools/xngen src/x32-transposec/hvx.c.in -D SIZE=32 IN_PTRS=MULTI OUT_PTRS=MULTI TILE_HEIGHT=32 -o src/x32-transposec/gen/x32-transposec-32x32-multi-multi-hvx.c &
+tools/xngen src/x32-transposec/hvx.c.in -D SIZE=32 IN_PTRS=MULTI OUT_PTRS=SWITCH TILE_HEIGHT=32 -o src/x32-transposec/gen/x32-transposec-32x32-multi-switch-hvx.c &
+tools/xngen src/x32-transposec/hvx.c.in -D SIZE=32 IN_PTRS=MULTI OUT_PTRS=MOV TILE_HEIGHT=32 -o src/x32-transposec/gen/x32-transposec-32x32-multi-mov-hvx.c &
+
 wait
diff --git a/src/x32-transposec/gen/x32-transposec-16x32-multi-multi-hvx.c b/src/x32-transposec/gen/x32-transposec-16x32-multi-multi-hvx.c
new file mode 100644
index 00000000000..09babe398ed
--- /dev/null
+++ b/src/x32-transposec/gen/x32-transposec-16x32-multi-multi-hvx.c
@@ -0,0 +1,240 @@
+// Auto-generated file. Do not edit!
+//   Template: src/x32-transposec/hvx.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2024 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <hexagon_types.h>
+
+#include "xnnpack/common.h"
+#include "xnnpack/math.h"
+#include "xnnpack/transpose.h"
+
+void xnn_x32_transposec_ukernel__16x32_multi_multi_hvx(
+    const uint32_t* input,
+    uint32_t* output,
+    size_t input_stride,
+    size_t output_stride,
+    size_t block_width,
+    size_t block_height) XNN_OOB_READS
+{
+  assert(block_width == 1 || output_stride >= block_height * sizeof(uint32_t));
+  assert(block_height == 1 || input_stride >= block_width * sizeof(uint32_t));
+
+  const size_t tile_height = 16;
+  const size_t tile_width = 32;
+  const size_t tile_hbytes = tile_height * sizeof(uint32_t);
+  const size_t tile_wbytes = tile_width * sizeof(uint32_t);
+  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
+  const size_t input_offset = tile_height * input_stride;
+  const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint32_t);
+
+  const uint32_t* i0 = input;
+  const uint32_t* i1 = (const uint32_t*) ((uintptr_t) i0 + input_stride);
+  const uint32_t* i2 = (const uint32_t*) ((uintptr_t) i1 + input_stride);
+  const uint32_t* i3 = (const uint32_t*) ((uintptr_t) i2 + input_stride);
+  const uint32_t* i4 = (const uint32_t*) ((uintptr_t) i3 + input_stride);
+  const uint32_t* i5 = (const uint32_t*) ((uintptr_t) i4 + input_stride);
+  const uint32_t* i6 = (const uint32_t*) ((uintptr_t) i5 + input_stride);
+  const uint32_t* i7 = (const uint32_t*) ((uintptr_t) i6 + input_stride);
+  const uint32_t* i8 = (const uint32_t*) ((uintptr_t) i7 + input_stride);
+  const uint32_t* i9 = (const uint32_t*) ((uintptr_t) i8 + input_stride);
+  const uint32_t* i10 = (const uint32_t*) ((uintptr_t) i9 + input_stride);
+  const uint32_t* i11 = (const uint32_t*) ((uintptr_t) i10 + input_stride);
+  const uint32_t* i12 = (const uint32_t*) ((uintptr_t) i11 + input_stride);
+  const uint32_t* i13 = (const uint32_t*) ((uintptr_t) i12 + input_stride);
+  const uint32_t* i14 = (const uint32_t*) ((uintptr_t) i13 + input_stride);
+  const uint32_t* i15 = (const uint32_t*) ((uintptr_t) i14 + input_stride);
+  uint32_t* o0 = (uint32_t*) output;
+  uint32_t* o1 = (uint32_t*) ((uintptr_t) o0 + output_stride);
+  uint32_t* o2 = (uint32_t*) ((uintptr_t) o1 + output_stride);
+  uint32_t* o3 = (uint32_t*) ((uintptr_t) o2 + output_stride);
+  uint32_t* o4 = (uint32_t*) ((uintptr_t) o3 + output_stride);
+  uint32_t* o5 = (uint32_t*) ((uintptr_t) o4 + output_stride);
+  uint32_t* o6 = (uint32_t*) ((uintptr_t) o5 + output_stride);
+  uint32_t* o7 = (uint32_t*) ((uintptr_t) o6 + output_stride);
+  uint32_t* o8 = (uint32_t*) ((uintptr_t) o7 + output_stride);
+  uint32_t* o9 = (uint32_t*) ((uintptr_t) o8 + output_stride);
+  uint32_t* o10 = (uint32_t*) ((uintptr_t) o9 + output_stride);
+  uint32_t* o11 = (uint32_t*) ((uintptr_t) o10 + output_stride);
+  uint32_t* o12 = (uint32_t*) ((uintptr_t) o11 + output_stride);
+  uint32_t* o13 = (uint32_t*) ((uintptr_t) o12 + output_stride);
+  uint32_t* o14 = (uint32_t*) ((uintptr_t) o13 + output_stride);
+  uint32_t* o15 = (uint32_t*) ((uintptr_t) o14 + output_stride);
+  const size_t minus_output_stride = -output_stride;
+
+  do {
+    if XNN_UNPREDICTABLE(block_width < 2) {
+      o1 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width <= 2) {
+      o2 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width < 4) {
+      o3 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width <= 4) {
+      o4 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width < 6) {
+      o5 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width <= 6) {
+      o6 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width < 8) {
+      o7 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width <= 8) {
+      o8 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width < 10) {
+      o9 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width <= 10) {
+      o10 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width < 12) {
+      o11 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width <= 12) {
+      o12 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width < 14) {
+      o13 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width <= 14) {
+      o14 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width < 16) {
+      o15 = o0;
+    }
+    size_t bh = block_height;
+    for (; bh >= 16; bh -= 16) {
+      const HVX_Vector v4_0 = *((HVX_UVector *) i0); i0 = (uint32_t*) ((uintptr_t) i0 + input_offset);
+      const HVX_Vector v4_1 = *((HVX_UVector *) i1); i1 = (uint32_t*) ((uintptr_t) i1 + input_offset);
+      const HVX_Vector v4_2 = *((HVX_UVector *) i2); i2 = (uint32_t*) ((uintptr_t) i2 + input_offset);
+      const HVX_Vector v4_3 = *((HVX_UVector *) i3); i3 = (uint32_t*) ((uintptr_t) i3 + input_offset);
+      const HVX_Vector v4_4 = *((HVX_UVector *) i4); i4 = (uint32_t*) ((uintptr_t) i4 + input_offset);
+      const HVX_Vector v4_5 = *((HVX_UVector *) i5); i5 = (uint32_t*) ((uintptr_t) i5 + input_offset);
+      const HVX_Vector v4_6 = *((HVX_UVector *) i6); i6 = (uint32_t*) ((uintptr_t) i6 + input_offset);
+      const HVX_Vector v4_7 = *((HVX_UVector *) i7); i7 = (uint32_t*) ((uintptr_t) i7 + input_offset);
+      const HVX_Vector v4_8 = *((HVX_UVector *) i8); i8 = (uint32_t*) ((uintptr_t) i8 + input_offset);
+      const HVX_Vector v4_9 = *((HVX_UVector *) i9); i9 = (uint32_t*) ((uintptr_t) i9 + input_offset);
+      const HVX_Vector v4_10 = *((HVX_UVector *) i10); i10 = (uint32_t*) ((uintptr_t) i10 + input_offset);
+      const HVX_Vector v4_11 = *((HVX_UVector *) i11); i11 = (uint32_t*) ((uintptr_t) i11 + input_offset);
+      const HVX_Vector v4_12 = *((HVX_UVector *) i12); i12 = (uint32_t*) ((uintptr_t) i12 + input_offset);
+      const HVX_Vector v4_13 = *((HVX_UVector *) i13); i13 = (uint32_t*) ((uintptr_t) i13 + input_offset);
+      const HVX_Vector v4_14 = *((HVX_UVector *) i14); i14 = (uint32_t*) ((uintptr_t) i14 + input_offset);
+      const HVX_Vector v4_15 = *((HVX_UVector *) i15); i15 = (uint32_t*) ((uintptr_t) i15 + input_offset);
+
+      int rt = -4;
+      const HVX_VectorPair v3_0 = Q6_W_vshuff_VVR(v4_1, v4_0, rt);
+      const HVX_VectorPair v3_1 = Q6_W_vshuff_VVR(v4_3, v4_2, rt);
+      const HVX_VectorPair v3_2 = Q6_W_vshuff_VVR(v4_5, v4_4, rt);
+      const HVX_VectorPair v3_3 = Q6_W_vshuff_VVR(v4_7, v4_6, rt);
+      const HVX_VectorPair v3_4 = Q6_W_vshuff_VVR(v4_9, v4_8, rt);
+      const HVX_VectorPair v3_5 = Q6_W_vshuff_VVR(v4_11, v4_10, rt);
+      const HVX_VectorPair v3_6 = Q6_W_vshuff_VVR(v4_13, v4_12, rt);
+      const HVX_VectorPair v3_7 = Q6_W_vshuff_VVR(v4_15, v4_14, rt);
+
+      rt = rt << 1;
+      HVX_VectorPair v2_0 = Q6_W_vshuff_VVR(Q6_V_lo_W(v3_1), Q6_V_lo_W(v3_0), rt);
+      HVX_VectorPair v2_1 = Q6_W_vshuff_VVR(Q6_V_hi_W(v3_1), Q6_V_hi_W(v3_0), rt);
+
+      HVX_VectorPair v2_2 = Q6_W_vshuff_VVR(Q6_V_lo_W(v3_3), Q6_V_lo_W(v3_2), rt);
+      HVX_VectorPair v2_3 = Q6_W_vshuff_VVR(Q6_V_hi_W(v3_3), Q6_V_hi_W(v3_2), rt);
+
+      HVX_VectorPair v2_4 = Q6_W_vshuff_VVR(Q6_V_lo_W(v3_5), Q6_V_lo_W(v3_4), rt);
+      HVX_VectorPair v2_5 = Q6_W_vshuff_VVR(Q6_V_hi_W(v3_5), Q6_V_hi_W(v3_4), rt);
+
+      HVX_VectorPair v2_6 = Q6_W_vshuff_VVR(Q6_V_lo_W(v3_7), Q6_V_lo_W(v3_6), rt);
+      HVX_VectorPair v2_7 = Q6_W_vshuff_VVR(Q6_V_hi_W(v3_7), Q6_V_hi_W(v3_6), rt);
+
+      rt = rt << 1;
+      HVX_VectorPair v1_0 = Q6_W_vshuff_VVR(Q6_V_lo_W(v2_2), Q6_V_lo_W(v2_0), rt);
+      HVX_VectorPair v1_1 = Q6_W_vshuff_VVR(Q6_V_hi_W(v2_2), Q6_V_hi_W(v2_0), rt);
+
+      HVX_VectorPair v1_2 = Q6_W_vshuff_VVR(Q6_V_lo_W(v2_3), Q6_V_lo_W(v2_1), rt);
+      HVX_VectorPair v1_3 = Q6_W_vshuff_VVR(Q6_V_hi_W(v2_3), Q6_V_hi_W(v2_1), rt);
+
+      HVX_VectorPair v1_4 = Q6_W_vshuff_VVR(Q6_V_lo_W(v2_6), Q6_V_lo_W(v2_4), rt);
+      HVX_VectorPair v1_5 = Q6_W_vshuff_VVR(Q6_V_hi_W(v2_6), Q6_V_hi_W(v2_4), rt);
+
+      HVX_VectorPair v1_6 = Q6_W_vshuff_VVR(Q6_V_lo_W(v2_7), Q6_V_lo_W(v2_5), rt);
+      HVX_VectorPair v1_7 = Q6_W_vshuff_VVR(Q6_V_hi_W(v2_7), Q6_V_hi_W(v2_5), rt);
+
+      rt = rt << 1;
+      HVX_VectorPair v0_0 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_4), Q6_V_lo_W(v1_0), rt);
+      HVX_VectorPair v0_1 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_4), Q6_V_hi_W(v1_0), rt);
+
+      HVX_VectorPair v0_2 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_5), Q6_V_lo_W(v1_1), rt);
+      HVX_VectorPair v0_3 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_5), Q6_V_hi_W(v1_1), rt);
+
+      HVX_VectorPair v0_4 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_6), Q6_V_lo_W(v1_2), rt);
+      HVX_VectorPair v0_5 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_6), Q6_V_hi_W(v1_2), rt);
+
+      HVX_VectorPair v0_6 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_7), Q6_V_lo_W(v1_3), rt);
+      HVX_VectorPair v0_7 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_7), Q6_V_hi_W(v1_3), rt);
+
+      xnn_storeu_f32(o15, Q6_V_hi_W(v0_7)); o15 = (uint32_t*) ((uintptr_t) o15 + tile_hbytes);
+      xnn_storeu_f32(o14, Q6_V_lo_W(v0_7)); o14 = (uint32_t*) ((uintptr_t) o14 + tile_hbytes);
+      xnn_storeu_f32(o13, Q6_V_hi_W(v0_6)); o13 = (uint32_t*) ((uintptr_t) o13 + tile_hbytes);
+      xnn_storeu_f32(o12, Q6_V_lo_W(v0_6)); o12 = (uint32_t*) ((uintptr_t) o12 + tile_hbytes);
+      xnn_storeu_f32(o11, Q6_V_hi_W(v0_5)); o11 = (uint32_t*) ((uintptr_t) o11 + tile_hbytes);
+      xnn_storeu_f32(o10, Q6_V_lo_W(v0_5)); o10 = (uint32_t*) ((uintptr_t) o10 + tile_hbytes);
+      xnn_storeu_f32(o9, Q6_V_hi_W(v0_4)); o9 = (uint32_t*) ((uintptr_t) o9 + tile_hbytes);
+      xnn_storeu_f32(o8, Q6_V_lo_W(v0_4)); o8 = (uint32_t*) ((uintptr_t) o8 + tile_hbytes);
+      xnn_storeu_f32(o7, Q6_V_hi_W(v0_3)); o7 = (uint32_t*) ((uintptr_t) o7 + tile_hbytes);
+      xnn_storeu_f32(o6, Q6_V_lo_W(v0_3)); o6 = (uint32_t*) ((uintptr_t) o6 + tile_hbytes);
+      xnn_storeu_f32(o5, Q6_V_hi_W(v0_2)); o5 = (uint32_t*) ((uintptr_t) o5 + tile_hbytes);
+      xnn_storeu_f32(o4, Q6_V_lo_W(v0_2)); o4 = (uint32_t*) ((uintptr_t) o4 + tile_hbytes);
+      xnn_storeu_f32(o3, Q6_V_hi_W(v0_1)); o3 = (uint32_t*) ((uintptr_t) o3 + tile_hbytes);
+      xnn_storeu_f32(o2, Q6_V_lo_W(v0_1)); o2 = (uint32_t*) ((uintptr_t) o2 + tile_hbytes);
+      xnn_storeu_f32(o1, Q6_V_hi_W(v0_0)); o1 = (uint32_t*) ((uintptr_t) o1 + tile_hbytes);
+      xnn_storeu_f32(o0, Q6_V_lo_W(v0_0)); o0 = (uint32_t*) ((uintptr_t) o0 + tile_hbytes);
+    }
+
+
+    i0 = (const uint32_t*) ((uintptr_t) i0 + input_reset);
+    i1 = (const uint32_t*) ((uintptr_t) i0 + input_stride);
+    i2 = (const uint32_t*) ((uintptr_t) i1 + input_stride);
+    i3 = (const uint32_t*) ((uintptr_t) i2 + input_stride);
+    i4 = (const uint32_t*) ((uintptr_t) i3 + input_stride);
+    i5 = (const uint32_t*) ((uintptr_t) i4 + input_stride);
+    i6 = (const uint32_t*) ((uintptr_t) i5 + input_stride);
+    i7 = (const uint32_t*) ((uintptr_t) i6 + input_stride);
+    i8 = (const uint32_t*) ((uintptr_t) i7 + input_stride);
+    i9 = (const uint32_t*) ((uintptr_t) i8 + input_stride);
+    i10 = (const uint32_t*) ((uintptr_t) i9 + input_stride);
+    i11 = (const uint32_t*) ((uintptr_t) i10 + input_stride);
+    i12 = (const uint32_t*) ((uintptr_t) i11 + input_stride);
+    i13 = (const uint32_t*) ((uintptr_t) i12 + input_stride);
+    i14 = (const uint32_t*) ((uintptr_t) i13 + input_stride);
+    i15 = (const uint32_t*) ((uintptr_t) i14 + input_stride);
+    o0 = (uint32_t*) ((uintptr_t) o0 + output_reset);
+    o1 = (uint32_t*) ((uintptr_t) o1 + output_reset);
+    o2 = (uint32_t*) ((uintptr_t) o2 + output_reset);
+    o3 = (uint32_t*) ((uintptr_t) o3 + output_reset);
+    o4 = (uint32_t*) ((uintptr_t) o4 + output_reset);
+    o5 = (uint32_t*) ((uintptr_t) o5 + output_reset);
+    o6 = (uint32_t*) ((uintptr_t) o6 + output_reset);
+    o7 = (uint32_t*) ((uintptr_t) o7 + output_reset);
+    o8 = (uint32_t*) ((uintptr_t) o8 + output_reset);
+    o9 = (uint32_t*) ((uintptr_t) o9 + output_reset);
+    o10 = (uint32_t*) ((uintptr_t) o10 + output_reset);
+    o11 = (uint32_t*) ((uintptr_t) o11 + output_reset);
+    o12 = (uint32_t*) ((uintptr_t) o12 + output_reset);
+    o13 = (uint32_t*) ((uintptr_t) o13 + output_reset);
+    o14 = (uint32_t*) ((uintptr_t) o14 + output_reset);
+    o15 = (uint32_t*) ((uintptr_t) o15 + output_reset);
+    block_width = doz(block_width, tile_width);
+  } while (block_width != 0);
+}
+
diff --git a/src/x32-transposec/gen/x32-transposec-2x32-multi-multi-hvx.c b/src/x32-transposec/gen/x32-transposec-2x32-multi-multi-hvx.c
new file mode 100644
index 00000000000..f3f4d8f93be
--- /dev/null
+++ b/src/x32-transposec/gen/x32-transposec-2x32-multi-multi-hvx.c
@@ -0,0 +1,68 @@
+// Auto-generated file. Do not edit!
+//   Template: src/x32-transposec/hvx.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2024 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <hexagon_types.h>
+
+#include "xnnpack/common.h"
+#include "xnnpack/math.h"
+#include "xnnpack/transpose.h"
+
+void xnn_x32_transposec_ukernel__2x32_multi_multi_hvx(
+    const uint32_t* input,
+    uint32_t* output,
+    size_t input_stride,
+    size_t output_stride,
+    size_t block_width,
+    size_t block_height) XNN_OOB_READS
+{
+  assert(block_width == 1 || output_stride >= block_height * sizeof(uint32_t));
+  assert(block_height == 1 || input_stride >= block_width * sizeof(uint32_t));
+
+  const size_t tile_height = 2;
+  const size_t tile_width = 32;
+  const size_t tile_hbytes = tile_height * sizeof(uint32_t);
+  const size_t tile_wbytes = tile_width * sizeof(uint32_t);
+  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
+  const size_t input_offset = tile_height * input_stride;
+  const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint32_t);
+
+  const uint32_t* i0 = input;
+  const uint32_t* i1 = (const uint32_t*) ((uintptr_t) i0 + input_stride);
+  uint32_t* o0 = (uint32_t*) output;
+  uint32_t* o1 = (uint32_t*) ((uintptr_t) o0 + output_stride);
+  const size_t minus_output_stride = -output_stride;
+
+  do {
+    if XNN_UNPREDICTABLE(block_width < 2) {
+      o1 = o0;
+    }
+    size_t bh = block_height;
+    for (; bh >= 2; bh -= 2) {
+      const HVX_Vector v1_0 = *((HVX_UVector *) i0); i0 = (uint32_t*) ((uintptr_t) i0 + input_offset);
+      const HVX_Vector v1_1 = *((HVX_UVector *) i1); i1 = (uint32_t*) ((uintptr_t) i1 + input_offset);
+
+      int rt = -4;
+      const HVX_VectorPair v0_0 = Q6_W_vshuff_VVR(v1_1, v1_0, rt);
+
+      xnn_storeu_f32(o1, Q6_V_hi_W(v0_0)); o1 = (uint32_t*) ((uintptr_t) o1 + tile_hbytes);
+      xnn_storeu_f32(o0, Q6_V_lo_W(v0_0)); o0 = (uint32_t*) ((uintptr_t) o0 + tile_hbytes);
+    }
+
+
+    i0 = (const uint32_t*) ((uintptr_t) i0 + input_reset);
+    i1 = (const uint32_t*) ((uintptr_t) i0 + input_stride);
+    o0 = (uint32_t*) ((uintptr_t) o0 + output_reset);
+    o1 = (uint32_t*) ((uintptr_t) o1 + output_reset);
+    block_width = doz(block_width, tile_width);
+  } while (block_width != 0);
+}
+
diff --git a/src/x32-transposec/gen/x32-transposec-32x32-multi-mov-hvx.c b/src/x32-transposec/gen/x32-transposec-32x32-multi-mov-hvx.c
new file mode 100644
index 00000000000..9d3f4207c52
--- /dev/null
+++ b/src/x32-transposec/gen/x32-transposec-32x32-multi-mov-hvx.c
@@ -0,0 +1,427 @@
+// Auto-generated file. Do not edit!
+//   Template: src/x32-transposec/hvx.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2024 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <hexagon_types.h>
+
+#include "xnnpack/common.h"
+#include "xnnpack/math.h"
+#include "xnnpack/transpose.h"
+
+void xnn_x32_transposec_ukernel__32x32_multi_mov_hvx(
+    const uint32_t* input,
+    uint32_t* output,
+    size_t input_stride,
+    size_t output_stride,
+    size_t block_width,
+    size_t block_height) XNN_OOB_READS
+{
+  assert(block_width == 1 || output_stride >= block_height * sizeof(uint32_t));
+  assert(block_height == 1 || input_stride >= block_width * sizeof(uint32_t));
+
+  const size_t tile_height = 32;
+  const size_t tile_width = 32;
+  const size_t tile_hbytes = tile_height * sizeof(uint32_t);
+  const size_t tile_wbytes = tile_width * sizeof(uint32_t);
+  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
+  const size_t input_offset = tile_height * input_stride;
+  const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint32_t) - tile_hbytes;
+
+  const uint32_t* i0 = input;
+  const uint32_t* i1 = (const uint32_t*) ((uintptr_t) i0 + input_stride);
+  const uint32_t* i2 = (const uint32_t*) ((uintptr_t) i1 + input_stride);
+  const uint32_t* i3 = (const uint32_t*) ((uintptr_t) i2 + input_stride);
+  const uint32_t* i4 = (const uint32_t*) ((uintptr_t) i3 + input_stride);
+  const uint32_t* i5 = (const uint32_t*) ((uintptr_t) i4 + input_stride);
+  const uint32_t* i6 = (const uint32_t*) ((uintptr_t) i5 + input_stride);
+  const uint32_t* i7 = (const uint32_t*) ((uintptr_t) i6 + input_stride);
+  const uint32_t* i8 = (const uint32_t*) ((uintptr_t) i7 + input_stride);
+  const uint32_t* i9 = (const uint32_t*) ((uintptr_t) i8 + input_stride);
+  const uint32_t* i10 = (const uint32_t*) ((uintptr_t) i9 + input_stride);
+  const uint32_t* i11 = (const uint32_t*) ((uintptr_t) i10 + input_stride);
+  const uint32_t* i12 = (const uint32_t*) ((uintptr_t) i11 + input_stride);
+  const uint32_t* i13 = (const uint32_t*) ((uintptr_t) i12 + input_stride);
+  const uint32_t* i14 = (const uint32_t*) ((uintptr_t) i13 + input_stride);
+  const uint32_t* i15 = (const uint32_t*) ((uintptr_t) i14 + input_stride);
+  const uint32_t* i16 = (const uint32_t*) ((uintptr_t) i15 + input_stride);
+  const uint32_t* i17 = (const uint32_t*) ((uintptr_t) i16 + input_stride);
+  const uint32_t* i18 = (const uint32_t*) ((uintptr_t) i17 + input_stride);
+  const uint32_t* i19 = (const uint32_t*) ((uintptr_t) i18 + input_stride);
+  const uint32_t* i20 = (const uint32_t*) ((uintptr_t) i19 + input_stride);
+  const uint32_t* i21 = (const uint32_t*) ((uintptr_t) i20 + input_stride);
+  const uint32_t* i22 = (const uint32_t*) ((uintptr_t) i21 + input_stride);
+  const uint32_t* i23 = (const uint32_t*) ((uintptr_t) i22 + input_stride);
+  const uint32_t* i24 = (const uint32_t*) ((uintptr_t) i23 + input_stride);
+  const uint32_t* i25 = (const uint32_t*) ((uintptr_t) i24 + input_stride);
+  const uint32_t* i26 = (const uint32_t*) ((uintptr_t) i25 + input_stride);
+  const uint32_t* i27 = (const uint32_t*) ((uintptr_t) i26 + input_stride);
+  const uint32_t* i28 = (const uint32_t*) ((uintptr_t) i27 + input_stride);
+  const uint32_t* i29 = (const uint32_t*) ((uintptr_t) i28 + input_stride);
+  const uint32_t* i30 = (const uint32_t*) ((uintptr_t) i29 + input_stride);
+  const uint32_t* i31 = (const uint32_t*) ((uintptr_t) i30 + input_stride);
+  uint32_t* o = (uint32_t*) ((uintptr_t) output - tile_hbytes);
+  const size_t minus_output_stride = -output_stride;
+
+  do {
+    const size_t rem = min(block_width - 1, 31);
+    const size_t oN_stride = rem * output_stride;
+    const size_t oN_offset = oN_stride + tile_hbytes;
+    size_t bh = block_height;
+    for (; bh >= 32; bh -= 32) {
+      const HVX_Vector v5_0 = *((HVX_UVector *) i0); i0 = (uint32_t*) ((uintptr_t) i0 + input_offset);
+      const HVX_Vector v5_1 = *((HVX_UVector *) i1); i1 = (uint32_t*) ((uintptr_t) i1 + input_offset);
+      const HVX_Vector v5_2 = *((HVX_UVector *) i2); i2 = (uint32_t*) ((uintptr_t) i2 + input_offset);
+      const HVX_Vector v5_3 = *((HVX_UVector *) i3); i3 = (uint32_t*) ((uintptr_t) i3 + input_offset);
+      const HVX_Vector v5_4 = *((HVX_UVector *) i4); i4 = (uint32_t*) ((uintptr_t) i4 + input_offset);
+      const HVX_Vector v5_5 = *((HVX_UVector *) i5); i5 = (uint32_t*) ((uintptr_t) i5 + input_offset);
+      const HVX_Vector v5_6 = *((HVX_UVector *) i6); i6 = (uint32_t*) ((uintptr_t) i6 + input_offset);
+      const HVX_Vector v5_7 = *((HVX_UVector *) i7); i7 = (uint32_t*) ((uintptr_t) i7 + input_offset);
+      const HVX_Vector v5_8 = *((HVX_UVector *) i8); i8 = (uint32_t*) ((uintptr_t) i8 + input_offset);
+      const HVX_Vector v5_9 = *((HVX_UVector *) i9); i9 = (uint32_t*) ((uintptr_t) i9 + input_offset);
+      const HVX_Vector v5_10 = *((HVX_UVector *) i10); i10 = (uint32_t*) ((uintptr_t) i10 + input_offset);
+      const HVX_Vector v5_11 = *((HVX_UVector *) i11); i11 = (uint32_t*) ((uintptr_t) i11 + input_offset);
+      const HVX_Vector v5_12 = *((HVX_UVector *) i12); i12 = (uint32_t*) ((uintptr_t) i12 + input_offset);
+      const HVX_Vector v5_13 = *((HVX_UVector *) i13); i13 = (uint32_t*) ((uintptr_t) i13 + input_offset);
+      const HVX_Vector v5_14 = *((HVX_UVector *) i14); i14 = (uint32_t*) ((uintptr_t) i14 + input_offset);
+      const HVX_Vector v5_15 = *((HVX_UVector *) i15); i15 = (uint32_t*) ((uintptr_t) i15 + input_offset);
+      const HVX_Vector v5_16 = *((HVX_UVector *) i16); i16 = (uint32_t*) ((uintptr_t) i16 + input_offset);
+      const HVX_Vector v5_17 = *((HVX_UVector *) i17); i17 = (uint32_t*) ((uintptr_t) i17 + input_offset);
+      const HVX_Vector v5_18 = *((HVX_UVector *) i18); i18 = (uint32_t*) ((uintptr_t) i18 + input_offset);
+      const HVX_Vector v5_19 = *((HVX_UVector *) i19); i19 = (uint32_t*) ((uintptr_t) i19 + input_offset);
+      const HVX_Vector v5_20 = *((HVX_UVector *) i20); i20 = (uint32_t*) ((uintptr_t) i20 + input_offset);
+      const HVX_Vector v5_21 = *((HVX_UVector *) i21); i21 = (uint32_t*) ((uintptr_t) i21 + input_offset);
+      const HVX_Vector v5_22 = *((HVX_UVector *) i22); i22 = (uint32_t*) ((uintptr_t) i22 + input_offset);
+      const HVX_Vector v5_23 = *((HVX_UVector *) i23); i23 = (uint32_t*) ((uintptr_t) i23 + input_offset);
+      const HVX_Vector v5_24 = *((HVX_UVector *) i24); i24 = (uint32_t*) ((uintptr_t) i24 + input_offset);
+      const HVX_Vector v5_25 = *((HVX_UVector *) i25); i25 = (uint32_t*) ((uintptr_t) i25 + input_offset);
+      const HVX_Vector v5_26 = *((HVX_UVector *) i26); i26 = (uint32_t*) ((uintptr_t) i26 + input_offset);
+      const HVX_Vector v5_27 = *((HVX_UVector *) i27); i27 = (uint32_t*) ((uintptr_t) i27 + input_offset);
+      const HVX_Vector v5_28 = *((HVX_UVector *) i28); i28 = (uint32_t*) ((uintptr_t) i28 + input_offset);
+      const HVX_Vector v5_29 = *((HVX_UVector *) i29); i29 = (uint32_t*) ((uintptr_t) i29 + input_offset);
+      const HVX_Vector v5_30 = *((HVX_UVector *) i30); i30 = (uint32_t*) ((uintptr_t) i30 + input_offset);
+      const HVX_Vector v5_31 = *((HVX_UVector *) i31); i31 = (uint32_t*) ((uintptr_t) i31 + input_offset);
+
+      int rt = -4;
+      const HVX_VectorPair v4_0 = Q6_W_vshuff_VVR(v5_1, v5_0, rt);
+      const HVX_VectorPair v4_1 = Q6_W_vshuff_VVR(v5_3, v5_2, rt);
+      const HVX_VectorPair v4_2 = Q6_W_vshuff_VVR(v5_5, v5_4, rt);
+      const HVX_VectorPair v4_3 = Q6_W_vshuff_VVR(v5_7, v5_6, rt);
+      const HVX_VectorPair v4_4 = Q6_W_vshuff_VVR(v5_9, v5_8, rt);
+      const HVX_VectorPair v4_5 = Q6_W_vshuff_VVR(v5_11, v5_10, rt);
+      const HVX_VectorPair v4_6 = Q6_W_vshuff_VVR(v5_13, v5_12, rt);
+      const HVX_VectorPair v4_7 = Q6_W_vshuff_VVR(v5_15, v5_14, rt);
+      const HVX_VectorPair v4_8 = Q6_W_vshuff_VVR(v5_17, v5_16, rt);
+      const HVX_VectorPair v4_9 = Q6_W_vshuff_VVR(v5_19, v5_18, rt);
+      const HVX_VectorPair v4_10 = Q6_W_vshuff_VVR(v5_21, v5_20, rt);
+      const HVX_VectorPair v4_11 = Q6_W_vshuff_VVR(v5_23, v5_22, rt);
+      const HVX_VectorPair v4_12 = Q6_W_vshuff_VVR(v5_25, v5_24, rt);
+      const HVX_VectorPair v4_13 = Q6_W_vshuff_VVR(v5_27, v5_26, rt);
+      const HVX_VectorPair v4_14 = Q6_W_vshuff_VVR(v5_29, v5_28, rt);
+      const HVX_VectorPair v4_15 = Q6_W_vshuff_VVR(v5_31, v5_30, rt);
+
+      rt = rt << 1;
+      HVX_VectorPair v3_0 = Q6_W_vshuff_VVR(Q6_V_lo_W(v4_1), Q6_V_lo_W(v4_0), rt);
+      HVX_VectorPair v3_1 = Q6_W_vshuff_VVR(Q6_V_hi_W(v4_1), Q6_V_hi_W(v4_0), rt);
+
+      HVX_VectorPair v3_2 = Q6_W_vshuff_VVR(Q6_V_lo_W(v4_3), Q6_V_lo_W(v4_2), rt);
+      HVX_VectorPair v3_3 = Q6_W_vshuff_VVR(Q6_V_hi_W(v4_3), Q6_V_hi_W(v4_2), rt);
+
+      HVX_VectorPair v3_4 = Q6_W_vshuff_VVR(Q6_V_lo_W(v4_5), Q6_V_lo_W(v4_4), rt);
+      HVX_VectorPair v3_5 = Q6_W_vshuff_VVR(Q6_V_hi_W(v4_5), Q6_V_hi_W(v4_4), rt);
+
+      HVX_VectorPair v3_6 = Q6_W_vshuff_VVR(Q6_V_lo_W(v4_7), Q6_V_lo_W(v4_6), rt);
+      HVX_VectorPair v3_7 = Q6_W_vshuff_VVR(Q6_V_hi_W(v4_7), Q6_V_hi_W(v4_6), rt);
+
+      HVX_VectorPair v3_8 = Q6_W_vshuff_VVR(Q6_V_lo_W(v4_9), Q6_V_lo_W(v4_8), rt);
+      HVX_VectorPair v3_9 = Q6_W_vshuff_VVR(Q6_V_hi_W(v4_9), Q6_V_hi_W(v4_8), rt);
+
+      HVX_VectorPair v3_10 = Q6_W_vshuff_VVR(Q6_V_lo_W(v4_11), Q6_V_lo_W(v4_10), rt);
+      HVX_VectorPair v3_11 = Q6_W_vshuff_VVR(Q6_V_hi_W(v4_11), Q6_V_hi_W(v4_10), rt);
+
+      HVX_VectorPair v3_12 = Q6_W_vshuff_VVR(Q6_V_lo_W(v4_13), Q6_V_lo_W(v4_12), rt);
+      HVX_VectorPair v3_13 = Q6_W_vshuff_VVR(Q6_V_hi_W(v4_13), Q6_V_hi_W(v4_12), rt);
+
+      HVX_VectorPair v3_14 = Q6_W_vshuff_VVR(Q6_V_lo_W(v4_15), Q6_V_lo_W(v4_14), rt);
+      HVX_VectorPair v3_15 = Q6_W_vshuff_VVR(Q6_V_hi_W(v4_15), Q6_V_hi_W(v4_14), rt);
+
+      rt = rt << 1;
+      HVX_VectorPair v2_0 = Q6_W_vshuff_VVR(Q6_V_lo_W(v3_2), Q6_V_lo_W(v3_0), rt);
+      HVX_VectorPair v2_1 = Q6_W_vshuff_VVR(Q6_V_hi_W(v3_2), Q6_V_hi_W(v3_0), rt);
+
+      HVX_VectorPair v2_2 = Q6_W_vshuff_VVR(Q6_V_lo_W(v3_3), Q6_V_lo_W(v3_1), rt);
+      HVX_VectorPair v2_3 = Q6_W_vshuff_VVR(Q6_V_hi_W(v3_3), Q6_V_hi_W(v3_1), rt);
+
+      HVX_VectorPair v2_4 = Q6_W_vshuff_VVR(Q6_V_lo_W(v3_6), Q6_V_lo_W(v3_4), rt);
+      HVX_VectorPair v2_5 = Q6_W_vshuff_VVR(Q6_V_hi_W(v3_6), Q6_V_hi_W(v3_4), rt);
+
+      HVX_VectorPair v2_6 = Q6_W_vshuff_VVR(Q6_V_lo_W(v3_7), Q6_V_lo_W(v3_5), rt);
+      HVX_VectorPair v2_7 = Q6_W_vshuff_VVR(Q6_V_hi_W(v3_7), Q6_V_hi_W(v3_5), rt);
+
+      HVX_VectorPair v2_8 = Q6_W_vshuff_VVR(Q6_V_lo_W(v3_10), Q6_V_lo_W(v3_8), rt);
+      HVX_VectorPair v2_9 = Q6_W_vshuff_VVR(Q6_V_hi_W(v3_10), Q6_V_hi_W(v3_8), rt);
+
+      HVX_VectorPair v2_10 = Q6_W_vshuff_VVR(Q6_V_lo_W(v3_11), Q6_V_lo_W(v3_9), rt);
+      HVX_VectorPair v2_11 = Q6_W_vshuff_VVR(Q6_V_hi_W(v3_11), Q6_V_hi_W(v3_9), rt);
+
+      HVX_VectorPair v2_12 = Q6_W_vshuff_VVR(Q6_V_lo_W(v3_14), Q6_V_lo_W(v3_12), rt);
+      HVX_VectorPair v2_13 = Q6_W_vshuff_VVR(Q6_V_hi_W(v3_14), Q6_V_hi_W(v3_12), rt);
+
+      HVX_VectorPair v2_14 = Q6_W_vshuff_VVR(Q6_V_lo_W(v3_15), Q6_V_lo_W(v3_13), rt);
+      HVX_VectorPair v2_15 = Q6_W_vshuff_VVR(Q6_V_hi_W(v3_15), Q6_V_hi_W(v3_13), rt);
+
+      rt = rt << 1;
+      HVX_VectorPair v1_0 = Q6_W_vshuff_VVR(Q6_V_lo_W(v2_4), Q6_V_lo_W(v2_0), rt);
+      HVX_VectorPair v1_1 = Q6_W_vshuff_VVR(Q6_V_hi_W(v2_4), Q6_V_hi_W(v2_0), rt);
+
+      HVX_VectorPair v1_2 = Q6_W_vshuff_VVR(Q6_V_lo_W(v2_5), Q6_V_lo_W(v2_1), rt);
+      HVX_VectorPair v1_3 = Q6_W_vshuff_VVR(Q6_V_hi_W(v2_5), Q6_V_hi_W(v2_1), rt);
+
+      HVX_VectorPair v1_4 = Q6_W_vshuff_VVR(Q6_V_lo_W(v2_6), Q6_V_lo_W(v2_2), rt);
+      HVX_VectorPair v1_5 = Q6_W_vshuff_VVR(Q6_V_hi_W(v2_6), Q6_V_hi_W(v2_2), rt);
+
+      HVX_VectorPair v1_6 = Q6_W_vshuff_VVR(Q6_V_lo_W(v2_7), Q6_V_lo_W(v2_3), rt);
+      HVX_VectorPair v1_7 = Q6_W_vshuff_VVR(Q6_V_hi_W(v2_7), Q6_V_hi_W(v2_3), rt);
+
+      HVX_VectorPair v1_8 = Q6_W_vshuff_VVR(Q6_V_lo_W(v2_12), Q6_V_lo_W(v2_8), rt);
+      HVX_VectorPair v1_9 = Q6_W_vshuff_VVR(Q6_V_hi_W(v2_12), Q6_V_hi_W(v2_8), rt);
+
+      HVX_VectorPair v1_10 = Q6_W_vshuff_VVR(Q6_V_lo_W(v2_13), Q6_V_lo_W(v2_9), rt);
+      HVX_VectorPair v1_11 = Q6_W_vshuff_VVR(Q6_V_hi_W(v2_13), Q6_V_hi_W(v2_9), rt);
+
+      HVX_VectorPair v1_12 = Q6_W_vshuff_VVR(Q6_V_lo_W(v2_14), Q6_V_lo_W(v2_10), rt);
+      HVX_VectorPair v1_13 = Q6_W_vshuff_VVR(Q6_V_hi_W(v2_14), Q6_V_hi_W(v2_10), rt);
+
+      HVX_VectorPair v1_14 = Q6_W_vshuff_VVR(Q6_V_lo_W(v2_15), Q6_V_lo_W(v2_11), rt);
+      HVX_VectorPair v1_15 = Q6_W_vshuff_VVR(Q6_V_hi_W(v2_15), Q6_V_hi_W(v2_11), rt);
+
+      rt = rt << 1;
+      HVX_VectorPair v0_0 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_8), Q6_V_lo_W(v1_0), rt);
+      HVX_VectorPair v0_1 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_8), Q6_V_hi_W(v1_0), rt);
+
+      HVX_VectorPair v0_2 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_9), Q6_V_lo_W(v1_1), rt);
+      HVX_VectorPair v0_3 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_9), Q6_V_hi_W(v1_1), rt);
+
+      HVX_VectorPair v0_4 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_10), Q6_V_lo_W(v1_2), rt);
+      HVX_VectorPair v0_5 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_10), Q6_V_hi_W(v1_2), rt);
+
+      HVX_VectorPair v0_6 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_11), Q6_V_lo_W(v1_3), rt);
+      HVX_VectorPair v0_7 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_11), Q6_V_hi_W(v1_3), rt);
+
+      HVX_VectorPair v0_8 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_12), Q6_V_lo_W(v1_4), rt);
+      HVX_VectorPair v0_9 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_12), Q6_V_hi_W(v1_4), rt);
+
+      HVX_VectorPair v0_10 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_13), Q6_V_lo_W(v1_5), rt);
+      HVX_VectorPair v0_11 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_13), Q6_V_hi_W(v1_5), rt);
+
+      HVX_VectorPair v0_12 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_14), Q6_V_lo_W(v1_6), rt);
+      HVX_VectorPair v0_13 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_14), Q6_V_hi_W(v1_6), rt);
+
+      HVX_VectorPair v0_14 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_15), Q6_V_lo_W(v1_7), rt);
+      HVX_VectorPair v0_15 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_15), Q6_V_hi_W(v1_7), rt);
+
+      o = (uint32_t*) ((uintptr_t) o + oN_offset);
+      xnn_storeu_f32(o, Q6_V_hi_W(v0_15));
+      uint32_t *oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width > 31) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_lo_W(v0_15));
+      oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width >= 30) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_hi_W(v0_14));
+      oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width > 29) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_lo_W(v0_14));
+      oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width >= 28) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_hi_W(v0_13));
+      oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width > 27) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_lo_W(v0_13));
+      oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width >= 26) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_hi_W(v0_12));
+      oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width > 25) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_lo_W(v0_12));
+      oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width >= 24) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_hi_W(v0_11));
+      oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width > 23) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_lo_W(v0_11));
+      oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width >= 22) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_hi_W(v0_10));
+      oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width > 21) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_lo_W(v0_10));
+      oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width >= 20) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_hi_W(v0_9));
+      oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width > 19) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_lo_W(v0_9));
+      oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width >= 18) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_hi_W(v0_8));
+      oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width > 17) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_lo_W(v0_8));
+      oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width >= 16) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_hi_W(v0_7));
+      oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width > 15) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_lo_W(v0_7));
+      oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width >= 14) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_hi_W(v0_6));
+      oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width > 13) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_lo_W(v0_6));
+      oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width >= 12) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_hi_W(v0_5));
+      oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width > 11) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_lo_W(v0_5));
+      oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width >= 10) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_hi_W(v0_4));
+      oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width > 9) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_lo_W(v0_4));
+      oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width >= 8) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_hi_W(v0_3));
+      oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width > 7) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_lo_W(v0_3));
+      oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width >= 6) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_hi_W(v0_2));
+      oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width > 5) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_lo_W(v0_2));
+      oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width >= 4) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_hi_W(v0_1));
+      oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width > 3) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_lo_W(v0_1));
+      oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width >= 2) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_hi_W(v0_0));
+      oN = (uint32_t*) ((uintptr_t) o + minus_output_stride);
+      if XNN_UNPREDICTABLE(block_width > 1) {
+        o = oN;
+      }
+      xnn_storeu_f32(o, Q6_V_lo_W(v0_0));
+    }
+    o = (uint32_t*) ((uintptr_t) o + tile_hbytes);
+
+
+    i0 = (const uint32_t*) ((uintptr_t) i0 + input_reset);
+    i1 = (const uint32_t*) ((uintptr_t) i0 + input_stride);
+    i2 = (const uint32_t*) ((uintptr_t) i1 + input_stride);
+    i3 = (const uint32_t*) ((uintptr_t) i2 + input_stride);
+    i4 = (const uint32_t*) ((uintptr_t) i3 + input_stride);
+    i5 = (const uint32_t*) ((uintptr_t) i4 + input_stride);
+    i6 = (const uint32_t*) ((uintptr_t) i5 + input_stride);
+    i7 = (const uint32_t*) ((uintptr_t) i6 + input_stride);
+    i8 = (const uint32_t*) ((uintptr_t) i7 + input_stride);
+    i9 = (const uint32_t*) ((uintptr_t) i8 + input_stride);
+    i10 = (const uint32_t*) ((uintptr_t) i9 + input_stride);
+    i11 = (const uint32_t*) ((uintptr_t) i10 + input_stride);
+    i12 = (const uint32_t*) ((uintptr_t) i11 + input_stride);
+    i13 = (const uint32_t*) ((uintptr_t) i12 + input_stride);
+    i14 = (const uint32_t*) ((uintptr_t) i13 + input_stride);
+    i15 = (const uint32_t*) ((uintptr_t) i14 + input_stride);
+    i16 = (const uint32_t*) ((uintptr_t) i15 + input_stride);
+    i17 = (const uint32_t*) ((uintptr_t) i16 + input_stride);
+    i18 = (const uint32_t*) ((uintptr_t) i17 + input_stride);
+    i19 = (const uint32_t*) ((uintptr_t) i18 + input_stride);
+    i20 = (const uint32_t*) ((uintptr_t) i19 + input_stride);
+    i21 = (const uint32_t*) ((uintptr_t) i20 + input_stride);
+    i22 = (const uint32_t*) ((uintptr_t) i21 + input_stride);
+    i23 = (const uint32_t*) ((uintptr_t) i22 + input_stride);
+    i24 = (const uint32_t*) ((uintptr_t) i23 + input_stride);
+    i25 = (const uint32_t*) ((uintptr_t) i24 + input_stride);
+    i26 = (const uint32_t*) ((uintptr_t) i25 + input_stride);
+    i27 = (const uint32_t*) ((uintptr_t) i26 + input_stride);
+    i28 = (const uint32_t*) ((uintptr_t) i27 + input_stride);
+    i29 = (const uint32_t*) ((uintptr_t) i28 + input_stride);
+    i30 = (const uint32_t*) ((uintptr_t) i29 + input_stride);
+    i31 = (const uint32_t*) ((uintptr_t) i30 + input_stride);
+    o = (uint32_t*) ((uintptr_t) o + output_reset);
+    block_width = doz(block_width, tile_width);
+  } while (block_width != 0);
+}
+
diff --git a/src/x32-transposec/gen/x32-transposec-32x32-multi-multi-hvx.c b/src/x32-transposec/gen/x32-transposec-32x32-multi-multi-hvx.c
new file mode 100644
index 00000000000..468ad494fb5
--- /dev/null
+++ b/src/x32-transposec/gen/x32-transposec-32x32-multi-multi-hvx.c
@@ -0,0 +1,453 @@
+// Auto-generated file. Do not edit!
+//   Template: src/x32-transposec/hvx.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2024 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <hexagon_types.h>
+
+#include "xnnpack/common.h"
+#include "xnnpack/math.h"
+#include "xnnpack/transpose.h"
+
+void xnn_x32_transposec_ukernel__32x32_multi_multi_hvx(
+    const uint32_t* input,
+    uint32_t* output,
+    size_t input_stride,
+    size_t output_stride,
+    size_t block_width,
+    size_t block_height) XNN_OOB_READS
+{
+  assert(block_width == 1 || output_stride >= block_height * sizeof(uint32_t));
+  assert(block_height == 1 || input_stride >= block_width * sizeof(uint32_t));
+
+  const size_t tile_height = 32;
+  const size_t tile_width = 32;
+  const size_t tile_hbytes = tile_height * sizeof(uint32_t);
+  const size_t tile_wbytes = tile_width * sizeof(uint32_t);
+  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
+  const size_t input_offset = tile_height * input_stride;
+  const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint32_t);
+
+  const uint32_t* i0 = input;
+  const uint32_t* i1 = (const uint32_t*) ((uintptr_t) i0 + input_stride);
+  const uint32_t* i2 = (const uint32_t*) ((uintptr_t) i1 + input_stride);
+  const uint32_t* i3 = (const uint32_t*) ((uintptr_t) i2 + input_stride);
+  const uint32_t* i4 = (const uint32_t*) ((uintptr_t) i3 + input_stride);
+  const uint32_t* i5 = (const uint32_t*) ((uintptr_t) i4 + input_stride);
+  const uint32_t* i6 = (const uint32_t*) ((uintptr_t) i5 + input_stride);
+  const uint32_t* i7 = (const uint32_t*) ((uintptr_t) i6 + input_stride);
+  const uint32_t* i8 = (const uint32_t*) ((uintptr_t) i7 + input_stride);
+  const uint32_t* i9 = (const uint32_t*) ((uintptr_t) i8 + input_stride);
+  const uint32_t* i10 = (const uint32_t*) ((uintptr_t) i9 + input_stride);
+  const uint32_t* i11 = (const uint32_t*) ((uintptr_t) i10 + input_stride);
+  const uint32_t* i12 = (const uint32_t*) ((uintptr_t) i11 + input_stride);
+  const uint32_t* i13 = (const uint32_t*) ((uintptr_t) i12 + input_stride);
+  const uint32_t* i14 = (const uint32_t*) ((uintptr_t) i13 + input_stride);
+  const uint32_t* i15 = (const uint32_t*) ((uintptr_t) i14 + input_stride);
+  const uint32_t* i16 = (const uint32_t*) ((uintptr_t) i15 + input_stride);
+  const uint32_t* i17 = (const uint32_t*) ((uintptr_t) i16 + input_stride);
+  const uint32_t* i18 = (const uint32_t*) ((uintptr_t) i17 + input_stride);
+  const uint32_t* i19 = (const uint32_t*) ((uintptr_t) i18 + input_stride);
+  const uint32_t* i20 = (const uint32_t*) ((uintptr_t) i19 + input_stride);
+  const uint32_t* i21 = (const uint32_t*) ((uintptr_t) i20 + input_stride);
+  const uint32_t* i22 = (const uint32_t*) ((uintptr_t) i21 + input_stride);
+  const uint32_t* i23 = (const uint32_t*) ((uintptr_t) i22 + input_stride);
+  const uint32_t* i24 = (const uint32_t*) ((uintptr_t) i23 + input_stride);
+  const uint32_t* i25 = (const uint32_t*) ((uintptr_t) i24 + input_stride);
+  const uint32_t* i26 = (const uint32_t*) ((uintptr_t) i25 + input_stride);
+  const uint32_t* i27 = (const uint32_t*) ((uintptr_t) i26 + input_stride);
+  const uint32_t* i28 = (const uint32_t*) ((uintptr_t) i27 + input_stride);
+  const uint32_t* i29 = (const uint32_t*) ((uintptr_t) i28 + input_stride);
+  const uint32_t* i30 = (const uint32_t*) ((uintptr_t) i29 + input_stride);
+  const uint32_t* i31 = (const uint32_t*) ((uintptr_t) i30 + input_stride);
+  uint32_t* o0 = (uint32_t*) output;
+  uint32_t* o1 = (uint32_t*) ((uintptr_t) o0 + output_stride);
+  uint32_t* o2 = (uint32_t*) ((uintptr_t) o1 + output_stride);
+  uint32_t* o3 = (uint32_t*) ((uintptr_t) o2 + output_stride);
+  uint32_t* o4 = (uint32_t*) ((uintptr_t) o3 + output_stride);
+  uint32_t* o5 = (uint32_t*) ((uintptr_t) o4 + output_stride);
+  uint32_t* o6 = (uint32_t*) ((uintptr_t) o5 + output_stride);
+  uint32_t* o7 = (uint32_t*) ((uintptr_t) o6 + output_stride);
+  uint32_t* o8 = (uint32_t*) ((uintptr_t) o7 + output_stride);
+  uint32_t* o9 = (uint32_t*) ((uintptr_t) o8 + output_stride);
+  uint32_t* o10 = (uint32_t*) ((uintptr_t) o9 + output_stride);
+  uint32_t* o11 = (uint32_t*) ((uintptr_t) o10 + output_stride);
+  uint32_t* o12 = (uint32_t*) ((uintptr_t) o11 + output_stride);
+  uint32_t* o13 = (uint32_t*) ((uintptr_t) o12 + output_stride);
+  uint32_t* o14 = (uint32_t*) ((uintptr_t) o13 + output_stride);
+  uint32_t* o15 = (uint32_t*) ((uintptr_t) o14 + output_stride);
+  uint32_t* o16 = (uint32_t*) ((uintptr_t) o15 + output_stride);
+  uint32_t* o17 = (uint32_t*) ((uintptr_t) o16 + output_stride);
+  uint32_t* o18 = (uint32_t*) ((uintptr_t) o17 + output_stride);
+  uint32_t* o19 = (uint32_t*) ((uintptr_t) o18 + output_stride);
+  uint32_t* o20 = (uint32_t*) ((uintptr_t) o19 + output_stride);
+  uint32_t* o21 = (uint32_t*) ((uintptr_t) o20 + output_stride);
+  uint32_t* o22 = (uint32_t*) ((uintptr_t) o21 + output_stride);
+  uint32_t* o23 = (uint32_t*) ((uintptr_t) o22 + output_stride);
+  uint32_t* o24 = (uint32_t*) ((uintptr_t) o23 + output_stride);
+  uint32_t* o25 = (uint32_t*) ((uintptr_t) o24 + output_stride);
+  uint32_t* o26 = (uint32_t*) ((uintptr_t) o25 + output_stride);
+  uint32_t* o27 = (uint32_t*) ((uintptr_t) o26 + output_stride);
+  uint32_t* o28 = (uint32_t*) ((uintptr_t) o27 + output_stride);
+  uint32_t* o29 = (uint32_t*) ((uintptr_t) o28 + output_stride);
+  uint32_t* o30 = (uint32_t*) ((uintptr_t) o29 + output_stride);
+  uint32_t* o31 = (uint32_t*) ((uintptr_t) o30 + output_stride);
+  const size_t minus_output_stride = -output_stride;
+
+  do {
+    if XNN_UNPREDICTABLE(block_width < 2) {
+      o1 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width <= 2) {
+      o2 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width < 4) {
+      o3 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width <= 4) {
+      o4 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width < 6) {
+      o5 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width <= 6) {
+      o6 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width < 8) {
+      o7 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width <= 8) {
+      o8 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width < 10) {
+      o9 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width <= 10) {
+      o10 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width < 12) {
+      o11 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width <= 12) {
+      o12 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width < 14) {
+      o13 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width <= 14) {
+      o14 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width < 16) {
+      o15 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width <= 16) {
+      o16 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width < 18) {
+      o17 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width <= 18) {
+      o18 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width < 20) {
+      o19 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width <= 20) {
+      o20 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width < 22) {
+      o21 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width <= 22) {
+      o22 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width < 24) {
+      o23 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width <= 24) {
+      o24 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width < 26) {
+      o25 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width <= 26) {
+      o26 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width < 28) {
+      o27 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width <= 28) {
+      o28 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width < 30) {
+      o29 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width <= 30) {
+      o30 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width < 32) {
+      o31 = o0;
+    }
+    size_t bh = block_height;
+    for (; bh >= 32; bh -= 32) {
+      const HVX_Vector v5_0 = *((HVX_UVector *) i0); i0 = (uint32_t*) ((uintptr_t) i0 + input_offset);
+      const HVX_Vector v5_1 = *((HVX_UVector *) i1); i1 = (uint32_t*) ((uintptr_t) i1 + input_offset);
+      const HVX_Vector v5_2 = *((HVX_UVector *) i2); i2 = (uint32_t*) ((uintptr_t) i2 + input_offset);
+      const HVX_Vector v5_3 = *((HVX_UVector *) i3); i3 = (uint32_t*) ((uintptr_t) i3 + input_offset);
+      const HVX_Vector v5_4 = *((HVX_UVector *) i4); i4 = (uint32_t*) ((uintptr_t) i4 + input_offset);
+      const HVX_Vector v5_5 = *((HVX_UVector *) i5); i5 = (uint32_t*) ((uintptr_t) i5 + input_offset);
+      const HVX_Vector v5_6 = *((HVX_UVector *) i6); i6 = (uint32_t*) ((uintptr_t) i6 + input_offset);
+      const HVX_Vector v5_7 = *((HVX_UVector *) i7); i7 = (uint32_t*) ((uintptr_t) i7 + input_offset);
+      const HVX_Vector v5_8 = *((HVX_UVector *) i8); i8 = (uint32_t*) ((uintptr_t) i8 + input_offset);
+      const HVX_Vector v5_9 = *((HVX_UVector *) i9); i9 = (uint32_t*) ((uintptr_t) i9 + input_offset);
+      const HVX_Vector v5_10 = *((HVX_UVector *) i10); i10 = (uint32_t*) ((uintptr_t) i10 + input_offset);
+      const HVX_Vector v5_11 = *((HVX_UVector *) i11); i11 = (uint32_t*) ((uintptr_t) i11 + input_offset);
+      const HVX_Vector v5_12 = *((HVX_UVector *) i12); i12 = (uint32_t*) ((uintptr_t) i12 + input_offset);
+      const HVX_Vector v5_13 = *((HVX_UVector *) i13); i13 = (uint32_t*) ((uintptr_t) i13 + input_offset);
+      const HVX_Vector v5_14 = *((HVX_UVector *) i14); i14 = (uint32_t*) ((uintptr_t) i14 + input_offset);
+      const HVX_Vector v5_15 = *((HVX_UVector *) i15); i15 = (uint32_t*) ((uintptr_t) i15 + input_offset);
+      const HVX_Vector v5_16 = *((HVX_UVector *) i16); i16 = (uint32_t*) ((uintptr_t) i16 + input_offset);
+      const HVX_Vector v5_17 = *((HVX_UVector *) i17); i17 = (uint32_t*) ((uintptr_t) i17 + input_offset);
+      const HVX_Vector v5_18 = *((HVX_UVector *) i18); i18 = (uint32_t*) ((uintptr_t) i18 + input_offset);
+      const HVX_Vector v5_19 = *((HVX_UVector *) i19); i19 = (uint32_t*) ((uintptr_t) i19 + input_offset);
+      const HVX_Vector v5_20 = *((HVX_UVector *) i20); i20 = (uint32_t*) ((uintptr_t) i20 + input_offset);
+      const HVX_Vector v5_21 = *((HVX_UVector *) i21); i21 = (uint32_t*) ((uintptr_t) i21 + input_offset);
+      const HVX_Vector v5_22 = *((HVX_UVector *) i22); i22 = (uint32_t*) ((uintptr_t) i22 + input_offset);
+      const HVX_Vector v5_23 = *((HVX_UVector *) i23); i23 = (uint32_t*) ((uintptr_t) i23 + input_offset);
+      const HVX_Vector v5_24 = *((HVX_UVector *) i24); i24 = (uint32_t*) ((uintptr_t) i24 + input_offset);
+      const HVX_Vector v5_25 = *((HVX_UVector *) i25); i25 = (uint32_t*) ((uintptr_t) i25 + input_offset);
+      const HVX_Vector v5_26 = *((HVX_UVector *) i26); i26 = (uint32_t*) ((uintptr_t) i26 + input_offset);
+      const HVX_Vector v5_27 = *((HVX_UVector *) i27); i27 = (uint32_t*) ((uintptr_t) i27 + input_offset);
+      const HVX_Vector v5_28 = *((HVX_UVector *) i28); i28 = (uint32_t*) ((uintptr_t) i28 + input_offset);
+      const HVX_Vector v5_29 = *((HVX_UVector *) i29); i29 = (uint32_t*) ((uintptr_t) i29 + input_offset);
+      const HVX_Vector v5_30 = *((HVX_UVector *) i30); i30 = (uint32_t*) ((uintptr_t) i30 + input_offset);
+      const HVX_Vector v5_31 = *((HVX_UVector *) i31); i31 = (uint32_t*) ((uintptr_t) i31 + input_offset);
+
+      int rt = -4;
+      const HVX_VectorPair v4_0 = Q6_W_vshuff_VVR(v5_1, v5_0, rt);
+      const HVX_VectorPair v4_1 = Q6_W_vshuff_VVR(v5_3, v5_2, rt);
+      const HVX_VectorPair v4_2 = Q6_W_vshuff_VVR(v5_5, v5_4, rt);
+      const HVX_VectorPair v4_3 = Q6_W_vshuff_VVR(v5_7, v5_6, rt);
+      const HVX_VectorPair v4_4 = Q6_W_vshuff_VVR(v5_9, v5_8, rt);
+      const HVX_VectorPair v4_5 = Q6_W_vshuff_VVR(v5_11, v5_10, rt);
+      const HVX_VectorPair v4_6 = Q6_W_vshuff_VVR(v5_13, v5_12, rt);
+      const HVX_VectorPair v4_7 = Q6_W_vshuff_VVR(v5_15, v5_14, rt);
+      const HVX_VectorPair v4_8 = Q6_W_vshuff_VVR(v5_17, v5_16, rt);
+      const HVX_VectorPair v4_9 = Q6_W_vshuff_VVR(v5_19, v5_18, rt);
+      const HVX_VectorPair v4_10 = Q6_W_vshuff_VVR(v5_21, v5_20, rt);
+      const HVX_VectorPair v4_11 = Q6_W_vshuff_VVR(v5_23, v5_22, rt);
+      const HVX_VectorPair v4_12 = Q6_W_vshuff_VVR(v5_25, v5_24, rt);
+      const HVX_VectorPair v4_13 = Q6_W_vshuff_VVR(v5_27, v5_26, rt);
+      const HVX_VectorPair v4_14 = Q6_W_vshuff_VVR(v5_29, v5_28, rt);
+      const HVX_VectorPair v4_15 = Q6_W_vshuff_VVR(v5_31, v5_30, rt);
+
+      rt = rt << 1;
+      HVX_VectorPair v3_0 = Q6_W_vshuff_VVR(Q6_V_lo_W(v4_1), Q6_V_lo_W(v4_0), rt);
+      HVX_VectorPair v3_1 = Q6_W_vshuff_VVR(Q6_V_hi_W(v4_1), Q6_V_hi_W(v4_0), rt);
+
+      HVX_VectorPair v3_2 = Q6_W_vshuff_VVR(Q6_V_lo_W(v4_3), Q6_V_lo_W(v4_2), rt);
+      HVX_VectorPair v3_3 = Q6_W_vshuff_VVR(Q6_V_hi_W(v4_3), Q6_V_hi_W(v4_2), rt);
+
+      HVX_VectorPair v3_4 = Q6_W_vshuff_VVR(Q6_V_lo_W(v4_5), Q6_V_lo_W(v4_4), rt);
+      HVX_VectorPair v3_5 = Q6_W_vshuff_VVR(Q6_V_hi_W(v4_5), Q6_V_hi_W(v4_4), rt);
+
+      HVX_VectorPair v3_6 = Q6_W_vshuff_VVR(Q6_V_lo_W(v4_7), Q6_V_lo_W(v4_6), rt);
+      HVX_VectorPair v3_7 = Q6_W_vshuff_VVR(Q6_V_hi_W(v4_7), Q6_V_hi_W(v4_6), rt);
+
+      HVX_VectorPair v3_8 = Q6_W_vshuff_VVR(Q6_V_lo_W(v4_9), Q6_V_lo_W(v4_8), rt);
+      HVX_VectorPair v3_9 = Q6_W_vshuff_VVR(Q6_V_hi_W(v4_9), Q6_V_hi_W(v4_8), rt);
+
+      HVX_VectorPair v3_10 = Q6_W_vshuff_VVR(Q6_V_lo_W(v4_11), Q6_V_lo_W(v4_10), rt);
+      HVX_VectorPair v3_11 = Q6_W_vshuff_VVR(Q6_V_hi_W(v4_11), Q6_V_hi_W(v4_10), rt);
+
+      HVX_VectorPair v3_12 = Q6_W_vshuff_VVR(Q6_V_lo_W(v4_13), Q6_V_lo_W(v4_12), rt);
+      HVX_VectorPair v3_13 = Q6_W_vshuff_VVR(Q6_V_hi_W(v4_13), Q6_V_hi_W(v4_12), rt);
+
+      HVX_VectorPair v3_14 = Q6_W_vshuff_VVR(Q6_V_lo_W(v4_15), Q6_V_lo_W(v4_14), rt);
+      HVX_VectorPair v3_15 = Q6_W_vshuff_VVR(Q6_V_hi_W(v4_15), Q6_V_hi_W(v4_14), rt);
+
+      rt = rt << 1;
+      HVX_VectorPair v2_0 = Q6_W_vshuff_VVR(Q6_V_lo_W(v3_2), Q6_V_lo_W(v3_0), rt);
+      HVX_VectorPair v2_1 = Q6_W_vshuff_VVR(Q6_V_hi_W(v3_2), Q6_V_hi_W(v3_0), rt);
+
+      HVX_VectorPair v2_2 = Q6_W_vshuff_VVR(Q6_V_lo_W(v3_3), Q6_V_lo_W(v3_1), rt);
+      HVX_VectorPair v2_3 = Q6_W_vshuff_VVR(Q6_V_hi_W(v3_3), Q6_V_hi_W(v3_1), rt);
+
+      HVX_VectorPair v2_4 = Q6_W_vshuff_VVR(Q6_V_lo_W(v3_6), Q6_V_lo_W(v3_4), rt);
+      HVX_VectorPair v2_5 = Q6_W_vshuff_VVR(Q6_V_hi_W(v3_6), Q6_V_hi_W(v3_4), rt);
+
+      HVX_VectorPair v2_6 = Q6_W_vshuff_VVR(Q6_V_lo_W(v3_7), Q6_V_lo_W(v3_5), rt);
+      HVX_VectorPair v2_7 = Q6_W_vshuff_VVR(Q6_V_hi_W(v3_7), Q6_V_hi_W(v3_5), rt);
+
+      HVX_VectorPair v2_8 = Q6_W_vshuff_VVR(Q6_V_lo_W(v3_10), Q6_V_lo_W(v3_8), rt);
+      HVX_VectorPair v2_9 = Q6_W_vshuff_VVR(Q6_V_hi_W(v3_10), Q6_V_hi_W(v3_8), rt);
+
+      HVX_VectorPair v2_10 = Q6_W_vshuff_VVR(Q6_V_lo_W(v3_11), Q6_V_lo_W(v3_9), rt);
+      HVX_VectorPair v2_11 = Q6_W_vshuff_VVR(Q6_V_hi_W(v3_11), Q6_V_hi_W(v3_9), rt);
+
+      HVX_VectorPair v2_12 = Q6_W_vshuff_VVR(Q6_V_lo_W(v3_14), Q6_V_lo_W(v3_12), rt);
+      HVX_VectorPair v2_13 = Q6_W_vshuff_VVR(Q6_V_hi_W(v3_14), Q6_V_hi_W(v3_12), rt);
+
+      HVX_VectorPair v2_14 = Q6_W_vshuff_VVR(Q6_V_lo_W(v3_15), Q6_V_lo_W(v3_13), rt);
+      HVX_VectorPair v2_15 = Q6_W_vshuff_VVR(Q6_V_hi_W(v3_15), Q6_V_hi_W(v3_13), rt);
+
+      rt = rt << 1;
+      HVX_VectorPair v1_0 = Q6_W_vshuff_VVR(Q6_V_lo_W(v2_4), Q6_V_lo_W(v2_0), rt);
+      HVX_VectorPair v1_1 = Q6_W_vshuff_VVR(Q6_V_hi_W(v2_4), Q6_V_hi_W(v2_0), rt);
+
+      HVX_VectorPair v1_2 = Q6_W_vshuff_VVR(Q6_V_lo_W(v2_5), Q6_V_lo_W(v2_1), rt);
+      HVX_VectorPair v1_3 = Q6_W_vshuff_VVR(Q6_V_hi_W(v2_5), Q6_V_hi_W(v2_1), rt);
+
+      HVX_VectorPair v1_4 = Q6_W_vshuff_VVR(Q6_V_lo_W(v2_6), Q6_V_lo_W(v2_2), rt);
+      HVX_VectorPair v1_5 = Q6_W_vshuff_VVR(Q6_V_hi_W(v2_6), Q6_V_hi_W(v2_2), rt);
+
+      HVX_VectorPair v1_6 = Q6_W_vshuff_VVR(Q6_V_lo_W(v2_7), Q6_V_lo_W(v2_3), rt);
+      HVX_VectorPair v1_7 = Q6_W_vshuff_VVR(Q6_V_hi_W(v2_7), Q6_V_hi_W(v2_3), rt);
+
+      HVX_VectorPair v1_8 = Q6_W_vshuff_VVR(Q6_V_lo_W(v2_12), Q6_V_lo_W(v2_8), rt);
+      HVX_VectorPair v1_9 = Q6_W_vshuff_VVR(Q6_V_hi_W(v2_12), Q6_V_hi_W(v2_8), rt);
+
+      HVX_VectorPair v1_10 = Q6_W_vshuff_VVR(Q6_V_lo_W(v2_13), Q6_V_lo_W(v2_9), rt);
+      HVX_VectorPair v1_11 = Q6_W_vshuff_VVR(Q6_V_hi_W(v2_13), Q6_V_hi_W(v2_9), rt);
+
+      HVX_VectorPair v1_12 = Q6_W_vshuff_VVR(Q6_V_lo_W(v2_14), Q6_V_lo_W(v2_10), rt);
+      HVX_VectorPair v1_13 = Q6_W_vshuff_VVR(Q6_V_hi_W(v2_14), Q6_V_hi_W(v2_10), rt);
+
+      HVX_VectorPair v1_14 = Q6_W_vshuff_VVR(Q6_V_lo_W(v2_15), Q6_V_lo_W(v2_11), rt);
+      HVX_VectorPair v1_15 = Q6_W_vshuff_VVR(Q6_V_hi_W(v2_15), Q6_V_hi_W(v2_11), rt);
+
+      rt = rt << 1;
+      HVX_VectorPair v0_0 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_8), Q6_V_lo_W(v1_0), rt);
+      HVX_VectorPair v0_1 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_8), Q6_V_hi_W(v1_0), rt);
+
+      HVX_VectorPair v0_2 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_9), Q6_V_lo_W(v1_1), rt);
+      HVX_VectorPair v0_3 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_9), Q6_V_hi_W(v1_1), rt);
+
+      HVX_VectorPair v0_4 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_10), Q6_V_lo_W(v1_2), rt);
+      HVX_VectorPair v0_5 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_10), Q6_V_hi_W(v1_2), rt);
+
+      HVX_VectorPair v0_6 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_11), Q6_V_lo_W(v1_3), rt);
+      HVX_VectorPair v0_7 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_11), Q6_V_hi_W(v1_3), rt);
+
+      HVX_VectorPair v0_8 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_12), Q6_V_lo_W(v1_4), rt);
+      HVX_VectorPair v0_9 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_12), Q6_V_hi_W(v1_4), rt);
+
+      HVX_VectorPair v0_10 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_13), Q6_V_lo_W(v1_5), rt);
+      HVX_VectorPair v0_11 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_13), Q6_V_hi_W(v1_5), rt);
+
+      HVX_VectorPair v0_12 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_14), Q6_V_lo_W(v1_6), rt);
+      HVX_VectorPair v0_13 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_14), Q6_V_hi_W(v1_6), rt);
+
+      HVX_VectorPair v0_14 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_15), Q6_V_lo_W(v1_7), rt);
+      HVX_VectorPair v0_15 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_15), Q6_V_hi_W(v1_7), rt);
+
+      xnn_storeu_f32(o31, Q6_V_hi_W(v0_15)); o31 = (uint32_t*) ((uintptr_t) o31 + tile_hbytes);
+      xnn_storeu_f32(o30, Q6_V_lo_W(v0_15)); o30 = (uint32_t*) ((uintptr_t) o30 + tile_hbytes);
+      xnn_storeu_f32(o29, Q6_V_hi_W(v0_14)); o29 = (uint32_t*) ((uintptr_t) o29 + tile_hbytes);
+      xnn_storeu_f32(o28, Q6_V_lo_W(v0_14)); o28 = (uint32_t*) ((uintptr_t) o28 + tile_hbytes);
+      xnn_storeu_f32(o27, Q6_V_hi_W(v0_13)); o27 = (uint32_t*) ((uintptr_t) o27 + tile_hbytes);
+      xnn_storeu_f32(o26, Q6_V_lo_W(v0_13)); o26 = (uint32_t*) ((uintptr_t) o26 + tile_hbytes);
+      xnn_storeu_f32(o25, Q6_V_hi_W(v0_12)); o25 = (uint32_t*) ((uintptr_t) o25 + tile_hbytes);
+      xnn_storeu_f32(o24, Q6_V_lo_W(v0_12)); o24 = (uint32_t*) ((uintptr_t) o24 + tile_hbytes);
+      xnn_storeu_f32(o23, Q6_V_hi_W(v0_11)); o23 = (uint32_t*) ((uintptr_t) o23 + tile_hbytes);
+      xnn_storeu_f32(o22, Q6_V_lo_W(v0_11)); o22 = (uint32_t*) ((uintptr_t) o22 + tile_hbytes);
+      xnn_storeu_f32(o21, Q6_V_hi_W(v0_10)); o21 = (uint32_t*) ((uintptr_t) o21 + tile_hbytes);
+      xnn_storeu_f32(o20, Q6_V_lo_W(v0_10)); o20 = (uint32_t*) ((uintptr_t) o20 + tile_hbytes);
+      xnn_storeu_f32(o19, Q6_V_hi_W(v0_9)); o19 = (uint32_t*) ((uintptr_t) o19 + tile_hbytes);
+      xnn_storeu_f32(o18, Q6_V_lo_W(v0_9)); o18 = (uint32_t*) ((uintptr_t) o18 + tile_hbytes);
+      xnn_storeu_f32(o17, Q6_V_hi_W(v0_8)); o17 = (uint32_t*) ((uintptr_t) o17 + tile_hbytes);
+      xnn_storeu_f32(o16, Q6_V_lo_W(v0_8)); o16 = (uint32_t*) ((uintptr_t) o16 + tile_hbytes);
+      xnn_storeu_f32(o15, Q6_V_hi_W(v0_7)); o15 = (uint32_t*) ((uintptr_t) o15 + tile_hbytes);
+      xnn_storeu_f32(o14, Q6_V_lo_W(v0_7)); o14 = (uint32_t*) ((uintptr_t) o14 + tile_hbytes);
+      xnn_storeu_f32(o13, Q6_V_hi_W(v0_6)); o13 = (uint32_t*) ((uintptr_t) o13 + tile_hbytes);
+      xnn_storeu_f32(o12, Q6_V_lo_W(v0_6)); o12 = (uint32_t*) ((uintptr_t) o12 + tile_hbytes);
+      xnn_storeu_f32(o11, Q6_V_hi_W(v0_5)); o11 = (uint32_t*) ((uintptr_t) o11 + tile_hbytes);
+      xnn_storeu_f32(o10, Q6_V_lo_W(v0_5)); o10 = (uint32_t*) ((uintptr_t) o10 + tile_hbytes);
+      xnn_storeu_f32(o9, Q6_V_hi_W(v0_4)); o9 = (uint32_t*) ((uintptr_t) o9 + tile_hbytes);
+      xnn_storeu_f32(o8, Q6_V_lo_W(v0_4)); o8 = (uint32_t*) ((uintptr_t) o8 + tile_hbytes);
+      xnn_storeu_f32(o7, Q6_V_hi_W(v0_3)); o7 = (uint32_t*) ((uintptr_t) o7 + tile_hbytes);
+      xnn_storeu_f32(o6, Q6_V_lo_W(v0_3)); o6 = (uint32_t*) ((uintptr_t) o6 + tile_hbytes);
+      xnn_storeu_f32(o5, Q6_V_hi_W(v0_2)); o5 = (uint32_t*) ((uintptr_t) o5 + tile_hbytes);
+      xnn_storeu_f32(o4, Q6_V_lo_W(v0_2)); o4 = (uint32_t*) ((uintptr_t) o4 + tile_hbytes);
+      xnn_storeu_f32(o3, Q6_V_hi_W(v0_1)); o3 = (uint32_t*) ((uintptr_t) o3 + tile_hbytes);
+      xnn_storeu_f32(o2, Q6_V_lo_W(v0_1)); o2 = (uint32_t*) ((uintptr_t) o2 + tile_hbytes);
+      xnn_storeu_f32(o1, Q6_V_hi_W(v0_0)); o1 = (uint32_t*) ((uintptr_t) o1 + tile_hbytes);
+      xnn_storeu_f32(o0, Q6_V_lo_W(v0_0)); o0 = (uint32_t*) ((uintptr_t) o0 + tile_hbytes);
+    }
+
+
+    i0 = (const uint32_t*) ((uintptr_t) i0 + input_reset);
+    i1 = (const uint32_t*) ((uintptr_t) i0 + input_stride);
+    i2 = (const uint32_t*) ((uintptr_t) i1 + input_stride);
+    i3 = (const uint32_t*) ((uintptr_t) i2 + input_stride);
+    i4 = (const uint32_t*) ((uintptr_t) i3 + input_stride);
+    i5 = (const uint32_t*) ((uintptr_t) i4 + input_stride);
+    i6 = (const uint32_t*) ((uintptr_t) i5 + input_stride);
+    i7 = (const uint32_t*) ((uintptr_t) i6 + input_stride);
+    i8 = (const uint32_t*) ((uintptr_t) i7 + input_stride);
+    i9 = (const uint32_t*) ((uintptr_t) i8 + input_stride);
+    i10 = (const uint32_t*) ((uintptr_t) i9 + input_stride);
+    i11 = (const uint32_t*) ((uintptr_t) i10 + input_stride);
+    i12 = (const uint32_t*) ((uintptr_t) i11 + input_stride);
+    i13 = (const uint32_t*) ((uintptr_t) i12 + input_stride);
+    i14 = (const uint32_t*) ((uintptr_t) i13 + input_stride);
+    i15 = (const uint32_t*) ((uintptr_t) i14 + input_stride);
+    i16 = (const uint32_t*) ((uintptr_t) i15 + input_stride);
+    i17 = (const uint32_t*) ((uintptr_t) i16 + input_stride);
+    i18 = (const uint32_t*) ((uintptr_t) i17 + input_stride);
+    i19 = (const uint32_t*) ((uintptr_t) i18 + input_stride);
+    i20 = (const uint32_t*) ((uintptr_t) i19 + input_stride);
+    i21 = (const uint32_t*) ((uintptr_t) i20 + input_stride);
+    i22 = (const uint32_t*) ((uintptr_t) i21 + input_stride);
+    i23 = (const uint32_t*) ((uintptr_t) i22 + input_stride);
+    i24 = (const uint32_t*) ((uintptr_t) i23 + input_stride);
+    i25 = (const uint32_t*) ((uintptr_t) i24 + input_stride);
+    i26 = (const
uint32_t*) ((uintptr_t) i25 + input_stride); + i27 = (const uint32_t*) ((uintptr_t) i26 + input_stride); + i28 = (const uint32_t*) ((uintptr_t) i27 + input_stride); + i29 = (const uint32_t*) ((uintptr_t) i28 + input_stride); + i30 = (const uint32_t*) ((uintptr_t) i29 + input_stride); + i31 = (const uint32_t*) ((uintptr_t) i30 + input_stride); + o0 = (uint32_t*) ((uintptr_t) o0 + output_reset); + o1 = (uint32_t*) ((uintptr_t) o1 + output_reset); + o2 = (uint32_t*) ((uintptr_t) o2 + output_reset); + o3 = (uint32_t*) ((uintptr_t) o3 + output_reset); + o4 = (uint32_t*) ((uintptr_t) o4 + output_reset); + o5 = (uint32_t*) ((uintptr_t) o5 + output_reset); + o6 = (uint32_t*) ((uintptr_t) o6 + output_reset); + o7 = (uint32_t*) ((uintptr_t) o7 + output_reset); + o8 = (uint32_t*) ((uintptr_t) o8 + output_reset); + o9 = (uint32_t*) ((uintptr_t) o9 + output_reset); + o10 = (uint32_t*) ((uintptr_t) o10 + output_reset); + o11 = (uint32_t*) ((uintptr_t) o11 + output_reset); + o12 = (uint32_t*) ((uintptr_t) o12 + output_reset); + o13 = (uint32_t*) ((uintptr_t) o13 + output_reset); + o14 = (uint32_t*) ((uintptr_t) o14 + output_reset); + o15 = (uint32_t*) ((uintptr_t) o15 + output_reset); + o16 = (uint32_t*) ((uintptr_t) o16 + output_reset); + o17 = (uint32_t*) ((uintptr_t) o17 + output_reset); + o18 = (uint32_t*) ((uintptr_t) o18 + output_reset); + o19 = (uint32_t*) ((uintptr_t) o19 + output_reset); + o20 = (uint32_t*) ((uintptr_t) o20 + output_reset); + o21 = (uint32_t*) ((uintptr_t) o21 + output_reset); + o22 = (uint32_t*) ((uintptr_t) o22 + output_reset); + o23 = (uint32_t*) ((uintptr_t) o23 + output_reset); + o24 = (uint32_t*) ((uintptr_t) o24 + output_reset); + o25 = (uint32_t*) ((uintptr_t) o25 + output_reset); + o26 = (uint32_t*) ((uintptr_t) o26 + output_reset); + o27 = (uint32_t*) ((uintptr_t) o27 + output_reset); + o28 = (uint32_t*) ((uintptr_t) o28 + output_reset); + o29 = (uint32_t*) ((uintptr_t) o29 + output_reset); + o30 = (uint32_t*) ((uintptr_t) o30 + output_reset); + o31 = (uint32_t*) ((uintptr_t) o31 + output_reset); + block_width = doz(block_width, tile_width); + } while (block_width != 0); +} + + diff --git a/src/x32-transposec/gen/x32-transposec-32x32-multi-switch-hvx.c b/src/x32-transposec/gen/x32-transposec-32x32-multi-switch-hvx.c new file mode 100644 index 00000000000..f61b073a206 --- /dev/null +++ b/src/x32-transposec/gen/x32-transposec-32x32-multi-switch-hvx.c @@ -0,0 +1,400 @@ +// Auto-generated file. Do not edit! +// Template: src/x32-transposec/hvx.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
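+
+// The SWITCH variant drives a single output pointer `o`: each column tile
+// computes rem = min(block_width - 1, 31) and enters a descending switch
+// whose fallthrough cases each store one transposed row and step back by
+// output_stride, so a partial final tile (block_width < 32) simply enters
+// at a lower case label and writes only the rows that exist.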
+
+#include <assert.h>
+
+#include <hexagon_types.h>
+
+#include "xnnpack/common.h"
+#include "xnnpack/math.h"
+#include "xnnpack/transpose.h"
+
+void xnn_x32_transposec_ukernel__32x32_multi_switch_hvx(
+    const uint32_t* input,
+    uint32_t* output,
+    size_t input_stride,
+    size_t output_stride,
+    size_t block_width,
+    size_t block_height) XNN_OOB_READS
+{
+  assert(block_width == 1 || output_stride >= block_height * sizeof(uint32_t));
+  assert(block_height == 1 || input_stride >= block_width * sizeof(uint32_t));
+
+  const size_t tile_height = 32;
+  const size_t tile_width = 32;
+  const size_t tile_hbytes = tile_height * sizeof(uint32_t);
+  const size_t tile_wbytes = tile_width * sizeof(uint32_t);
+  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
+  const size_t input_offset = tile_height * input_stride;
+  const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint32_t);
+
+  const uint32_t* i0 = input;
+  const uint32_t* i1 = (const uint32_t*) ((uintptr_t) i0 + input_stride);
+  const uint32_t* i2 = (const uint32_t*) ((uintptr_t) i1 + input_stride);
+  const uint32_t* i3 = (const uint32_t*) ((uintptr_t) i2 + input_stride);
+  const uint32_t* i4 = (const uint32_t*) ((uintptr_t) i3 + input_stride);
+  const uint32_t* i5 = (const uint32_t*) ((uintptr_t) i4 + input_stride);
+  const uint32_t* i6 = (const uint32_t*) ((uintptr_t) i5 + input_stride);
+  const uint32_t* i7 = (const uint32_t*) ((uintptr_t) i6 + input_stride);
+  const uint32_t* i8 = (const uint32_t*) ((uintptr_t) i7 + input_stride);
+  const uint32_t* i9 = (const uint32_t*) ((uintptr_t) i8 + input_stride);
+  const uint32_t* i10 = (const uint32_t*) ((uintptr_t) i9 + input_stride);
+  const uint32_t* i11 = (const uint32_t*) ((uintptr_t) i10 + input_stride);
+  const uint32_t* i12 = (const uint32_t*) ((uintptr_t) i11 + input_stride);
+  const uint32_t* i13 = (const uint32_t*) ((uintptr_t) i12 + input_stride);
+  const uint32_t* i14 = (const uint32_t*) ((uintptr_t) i13 + input_stride);
+  const uint32_t* i15 = (const uint32_t*) ((uintptr_t) i14 + input_stride);
+  const uint32_t* i16 = (const uint32_t*) ((uintptr_t) i15 + input_stride);
+  const uint32_t* i17 = (const uint32_t*) ((uintptr_t) i16 + input_stride);
+  const uint32_t* i18 = (const uint32_t*) ((uintptr_t) i17 + input_stride);
+  const uint32_t* i19 = (const uint32_t*) ((uintptr_t) i18 + input_stride);
+  const uint32_t* i20 = (const uint32_t*) ((uintptr_t) i19 + input_stride);
+  const uint32_t* i21 = (const uint32_t*) ((uintptr_t) i20 + input_stride);
+  const uint32_t* i22 = (const uint32_t*) ((uintptr_t) i21 + input_stride);
+  const uint32_t* i23 = (const uint32_t*) ((uintptr_t) i22 + input_stride);
+  const uint32_t* i24 = (const uint32_t*) ((uintptr_t) i23 + input_stride);
+  const uint32_t* i25 = (const uint32_t*) ((uintptr_t) i24 + input_stride);
+  const uint32_t* i26 = (const uint32_t*) ((uintptr_t) i25 + input_stride);
+  const uint32_t* i27 = (const uint32_t*) ((uintptr_t) i26 + input_stride);
+  const uint32_t* i28 = (const uint32_t*) ((uintptr_t) i27 + input_stride);
+  const uint32_t* i29 = (const uint32_t*) ((uintptr_t) i28 + input_stride);
+  const uint32_t* i30 = (const uint32_t*) ((uintptr_t) i29 + input_stride);
+  const uint32_t* i31 = (const uint32_t*) ((uintptr_t) i30 + input_stride);
+  uint32_t* o = (uint32_t*) output;
+  const size_t minus_output_stride = -output_stride;
+
+  do {
+    const size_t rem = min(block_width - 1, 31);
+    const size_t oN_stride = rem * output_stride;
+    size_t bh = block_height;
+    for (; bh >= 32;
bh -= 32) { + const HVX_Vector v5_0 = *((HVX_UVector *) i0); i0 = (uint32_t*) ((uintptr_t) i0 + input_offset); + const HVX_Vector v5_1 = *((HVX_UVector *) i1); i1 = (uint32_t*) ((uintptr_t) i1 + input_offset); + const HVX_Vector v5_2 = *((HVX_UVector *) i2); i2 = (uint32_t*) ((uintptr_t) i2 + input_offset); + const HVX_Vector v5_3 = *((HVX_UVector *) i3); i3 = (uint32_t*) ((uintptr_t) i3 + input_offset); + const HVX_Vector v5_4 = *((HVX_UVector *) i4); i4 = (uint32_t*) ((uintptr_t) i4 + input_offset); + const HVX_Vector v5_5 = *((HVX_UVector *) i5); i5 = (uint32_t*) ((uintptr_t) i5 + input_offset); + const HVX_Vector v5_6 = *((HVX_UVector *) i6); i6 = (uint32_t*) ((uintptr_t) i6 + input_offset); + const HVX_Vector v5_7 = *((HVX_UVector *) i7); i7 = (uint32_t*) ((uintptr_t) i7 + input_offset); + const HVX_Vector v5_8 = *((HVX_UVector *) i8); i8 = (uint32_t*) ((uintptr_t) i8 + input_offset); + const HVX_Vector v5_9 = *((HVX_UVector *) i9); i9 = (uint32_t*) ((uintptr_t) i9 + input_offset); + const HVX_Vector v5_10 = *((HVX_UVector *) i10); i10 = (uint32_t*) ((uintptr_t) i10 + input_offset); + const HVX_Vector v5_11 = *((HVX_UVector *) i11); i11 = (uint32_t*) ((uintptr_t) i11 + input_offset); + const HVX_Vector v5_12 = *((HVX_UVector *) i12); i12 = (uint32_t*) ((uintptr_t) i12 + input_offset); + const HVX_Vector v5_13 = *((HVX_UVector *) i13); i13 = (uint32_t*) ((uintptr_t) i13 + input_offset); + const HVX_Vector v5_14 = *((HVX_UVector *) i14); i14 = (uint32_t*) ((uintptr_t) i14 + input_offset); + const HVX_Vector v5_15 = *((HVX_UVector *) i15); i15 = (uint32_t*) ((uintptr_t) i15 + input_offset); + const HVX_Vector v5_16 = *((HVX_UVector *) i16); i16 = (uint32_t*) ((uintptr_t) i16 + input_offset); + const HVX_Vector v5_17 = *((HVX_UVector *) i17); i17 = (uint32_t*) ((uintptr_t) i17 + input_offset); + const HVX_Vector v5_18 = *((HVX_UVector *) i18); i18 = (uint32_t*) ((uintptr_t) i18 + input_offset); + const HVX_Vector v5_19 = *((HVX_UVector *) i19); i19 = (uint32_t*) ((uintptr_t) i19 + input_offset); + const HVX_Vector v5_20 = *((HVX_UVector *) i20); i20 = (uint32_t*) ((uintptr_t) i20 + input_offset); + const HVX_Vector v5_21 = *((HVX_UVector *) i21); i21 = (uint32_t*) ((uintptr_t) i21 + input_offset); + const HVX_Vector v5_22 = *((HVX_UVector *) i22); i22 = (uint32_t*) ((uintptr_t) i22 + input_offset); + const HVX_Vector v5_23 = *((HVX_UVector *) i23); i23 = (uint32_t*) ((uintptr_t) i23 + input_offset); + const HVX_Vector v5_24 = *((HVX_UVector *) i24); i24 = (uint32_t*) ((uintptr_t) i24 + input_offset); + const HVX_Vector v5_25 = *((HVX_UVector *) i25); i25 = (uint32_t*) ((uintptr_t) i25 + input_offset); + const HVX_Vector v5_26 = *((HVX_UVector *) i26); i26 = (uint32_t*) ((uintptr_t) i26 + input_offset); + const HVX_Vector v5_27 = *((HVX_UVector *) i27); i27 = (uint32_t*) ((uintptr_t) i27 + input_offset); + const HVX_Vector v5_28 = *((HVX_UVector *) i28); i28 = (uint32_t*) ((uintptr_t) i28 + input_offset); + const HVX_Vector v5_29 = *((HVX_UVector *) i29); i29 = (uint32_t*) ((uintptr_t) i29 + input_offset); + const HVX_Vector v5_30 = *((HVX_UVector *) i30); i30 = (uint32_t*) ((uintptr_t) i30 + input_offset); + const HVX_Vector v5_31 = *((HVX_UVector *) i31); i31 = (uint32_t*) ((uintptr_t) i31 + input_offset); + + int rt = -4; + const HVX_VectorPair v4_0 = Q6_W_vshuff_VVR(v5_1, v5_0, rt); + const HVX_VectorPair v4_1 = Q6_W_vshuff_VVR(v5_3, v5_2, rt); + const HVX_VectorPair v4_2 = Q6_W_vshuff_VVR(v5_5, v5_4, rt); + const HVX_VectorPair v4_3 = Q6_W_vshuff_VVR(v5_7, v5_6, rt); + const 
HVX_VectorPair v4_4 = Q6_W_vshuff_VVR(v5_9, v5_8, rt); + const HVX_VectorPair v4_5 = Q6_W_vshuff_VVR(v5_11, v5_10, rt); + const HVX_VectorPair v4_6 = Q6_W_vshuff_VVR(v5_13, v5_12, rt); + const HVX_VectorPair v4_7 = Q6_W_vshuff_VVR(v5_15, v5_14, rt); + const HVX_VectorPair v4_8 = Q6_W_vshuff_VVR(v5_17, v5_16, rt); + const HVX_VectorPair v4_9 = Q6_W_vshuff_VVR(v5_19, v5_18, rt); + const HVX_VectorPair v4_10 = Q6_W_vshuff_VVR(v5_21, v5_20, rt); + const HVX_VectorPair v4_11 = Q6_W_vshuff_VVR(v5_23, v5_22, rt); + const HVX_VectorPair v4_12 = Q6_W_vshuff_VVR(v5_25, v5_24, rt); + const HVX_VectorPair v4_13 = Q6_W_vshuff_VVR(v5_27, v5_26, rt); + const HVX_VectorPair v4_14 = Q6_W_vshuff_VVR(v5_29, v5_28, rt); + const HVX_VectorPair v4_15 = Q6_W_vshuff_VVR(v5_31, v5_30, rt); + + rt = rt << 1; + HVX_VectorPair v3_0 = Q6_W_vshuff_VVR(Q6_V_lo_W(v4_1), Q6_V_lo_W(v4_0), rt); + HVX_VectorPair v3_1 = Q6_W_vshuff_VVR(Q6_V_hi_W(v4_1), Q6_V_hi_W(v4_0), rt); + + HVX_VectorPair v3_2 = Q6_W_vshuff_VVR(Q6_V_lo_W(v4_3), Q6_V_lo_W(v4_2), rt); + HVX_VectorPair v3_3 = Q6_W_vshuff_VVR(Q6_V_hi_W(v4_3), Q6_V_hi_W(v4_2), rt); + + HVX_VectorPair v3_4 = Q6_W_vshuff_VVR(Q6_V_lo_W(v4_5), Q6_V_lo_W(v4_4), rt); + HVX_VectorPair v3_5 = Q6_W_vshuff_VVR(Q6_V_hi_W(v4_5), Q6_V_hi_W(v4_4), rt); + + HVX_VectorPair v3_6 = Q6_W_vshuff_VVR(Q6_V_lo_W(v4_7), Q6_V_lo_W(v4_6), rt); + HVX_VectorPair v3_7 = Q6_W_vshuff_VVR(Q6_V_hi_W(v4_7), Q6_V_hi_W(v4_6), rt); + + HVX_VectorPair v3_8 = Q6_W_vshuff_VVR(Q6_V_lo_W(v4_9), Q6_V_lo_W(v4_8), rt); + HVX_VectorPair v3_9 = Q6_W_vshuff_VVR(Q6_V_hi_W(v4_9), Q6_V_hi_W(v4_8), rt); + + HVX_VectorPair v3_10 = Q6_W_vshuff_VVR(Q6_V_lo_W(v4_11), Q6_V_lo_W(v4_10), rt); + HVX_VectorPair v3_11 = Q6_W_vshuff_VVR(Q6_V_hi_W(v4_11), Q6_V_hi_W(v4_10), rt); + + HVX_VectorPair v3_12 = Q6_W_vshuff_VVR(Q6_V_lo_W(v4_13), Q6_V_lo_W(v4_12), rt); + HVX_VectorPair v3_13 = Q6_W_vshuff_VVR(Q6_V_hi_W(v4_13), Q6_V_hi_W(v4_12), rt); + + HVX_VectorPair v3_14 = Q6_W_vshuff_VVR(Q6_V_lo_W(v4_15), Q6_V_lo_W(v4_14), rt); + HVX_VectorPair v3_15 = Q6_W_vshuff_VVR(Q6_V_hi_W(v4_15), Q6_V_hi_W(v4_14), rt); + + rt = rt << 1; + HVX_VectorPair v2_0 = Q6_W_vshuff_VVR(Q6_V_lo_W(v3_2), Q6_V_lo_W(v3_0), rt); + HVX_VectorPair v2_1 = Q6_W_vshuff_VVR(Q6_V_hi_W(v3_2), Q6_V_hi_W(v3_0), rt); + + HVX_VectorPair v2_2 = Q6_W_vshuff_VVR(Q6_V_lo_W(v3_3), Q6_V_lo_W(v3_1), rt); + HVX_VectorPair v2_3 = Q6_W_vshuff_VVR(Q6_V_hi_W(v3_3), Q6_V_hi_W(v3_1), rt); + + HVX_VectorPair v2_4 = Q6_W_vshuff_VVR(Q6_V_lo_W(v3_6), Q6_V_lo_W(v3_4), rt); + HVX_VectorPair v2_5 = Q6_W_vshuff_VVR(Q6_V_hi_W(v3_6), Q6_V_hi_W(v3_4), rt); + + HVX_VectorPair v2_6 = Q6_W_vshuff_VVR(Q6_V_lo_W(v3_7), Q6_V_lo_W(v3_5), rt); + HVX_VectorPair v2_7 = Q6_W_vshuff_VVR(Q6_V_hi_W(v3_7), Q6_V_hi_W(v3_5), rt); + + HVX_VectorPair v2_8 = Q6_W_vshuff_VVR(Q6_V_lo_W(v3_10), Q6_V_lo_W(v3_8), rt); + HVX_VectorPair v2_9 = Q6_W_vshuff_VVR(Q6_V_hi_W(v3_10), Q6_V_hi_W(v3_8), rt); + + HVX_VectorPair v2_10 = Q6_W_vshuff_VVR(Q6_V_lo_W(v3_11), Q6_V_lo_W(v3_9), rt); + HVX_VectorPair v2_11 = Q6_W_vshuff_VVR(Q6_V_hi_W(v3_11), Q6_V_hi_W(v3_9), rt); + + HVX_VectorPair v2_12 = Q6_W_vshuff_VVR(Q6_V_lo_W(v3_14), Q6_V_lo_W(v3_12), rt); + HVX_VectorPair v2_13 = Q6_W_vshuff_VVR(Q6_V_hi_W(v3_14), Q6_V_hi_W(v3_12), rt); + + HVX_VectorPair v2_14 = Q6_W_vshuff_VVR(Q6_V_lo_W(v3_15), Q6_V_lo_W(v3_13), rt); + HVX_VectorPair v2_15 = Q6_W_vshuff_VVR(Q6_V_hi_W(v3_15), Q6_V_hi_W(v3_13), rt); + + rt = rt << 1; + HVX_VectorPair v1_0 = Q6_W_vshuff_VVR(Q6_V_lo_W(v2_4), Q6_V_lo_W(v2_0), rt); + HVX_VectorPair v1_1 = 
Q6_W_vshuff_VVR(Q6_V_hi_W(v2_4), Q6_V_hi_W(v2_0), rt); + + HVX_VectorPair v1_2 = Q6_W_vshuff_VVR(Q6_V_lo_W(v2_5), Q6_V_lo_W(v2_1), rt); + HVX_VectorPair v1_3 = Q6_W_vshuff_VVR(Q6_V_hi_W(v2_5), Q6_V_hi_W(v2_1), rt); + + HVX_VectorPair v1_4 = Q6_W_vshuff_VVR(Q6_V_lo_W(v2_6), Q6_V_lo_W(v2_2), rt); + HVX_VectorPair v1_5 = Q6_W_vshuff_VVR(Q6_V_hi_W(v2_6), Q6_V_hi_W(v2_2), rt); + + HVX_VectorPair v1_6 = Q6_W_vshuff_VVR(Q6_V_lo_W(v2_7), Q6_V_lo_W(v2_3), rt); + HVX_VectorPair v1_7 = Q6_W_vshuff_VVR(Q6_V_hi_W(v2_7), Q6_V_hi_W(v2_3), rt); + + HVX_VectorPair v1_8 = Q6_W_vshuff_VVR(Q6_V_lo_W(v2_12), Q6_V_lo_W(v2_8), rt); + HVX_VectorPair v1_9 = Q6_W_vshuff_VVR(Q6_V_hi_W(v2_12), Q6_V_hi_W(v2_8), rt); + + HVX_VectorPair v1_10 = Q6_W_vshuff_VVR(Q6_V_lo_W(v2_13), Q6_V_lo_W(v2_9), rt); + HVX_VectorPair v1_11 = Q6_W_vshuff_VVR(Q6_V_hi_W(v2_13), Q6_V_hi_W(v2_9), rt); + + HVX_VectorPair v1_12 = Q6_W_vshuff_VVR(Q6_V_lo_W(v2_14), Q6_V_lo_W(v2_10), rt); + HVX_VectorPair v1_13 = Q6_W_vshuff_VVR(Q6_V_hi_W(v2_14), Q6_V_hi_W(v2_10), rt); + + HVX_VectorPair v1_14 = Q6_W_vshuff_VVR(Q6_V_lo_W(v2_15), Q6_V_lo_W(v2_11), rt); + HVX_VectorPair v1_15 = Q6_W_vshuff_VVR(Q6_V_hi_W(v2_15), Q6_V_hi_W(v2_11), rt); + + rt = rt << 1; + HVX_VectorPair v0_0 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_8), Q6_V_lo_W(v1_0), rt); + HVX_VectorPair v0_1 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_8), Q6_V_hi_W(v1_0), rt); + + HVX_VectorPair v0_2 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_9), Q6_V_lo_W(v1_1), rt); + HVX_VectorPair v0_3 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_9), Q6_V_hi_W(v1_1), rt); + + HVX_VectorPair v0_4 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_10), Q6_V_lo_W(v1_2), rt); + HVX_VectorPair v0_5 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_10), Q6_V_hi_W(v1_2), rt); + + HVX_VectorPair v0_6 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_11), Q6_V_lo_W(v1_3), rt); + HVX_VectorPair v0_7 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_11), Q6_V_hi_W(v1_3), rt); + + HVX_VectorPair v0_8 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_12), Q6_V_lo_W(v1_4), rt); + HVX_VectorPair v0_9 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_12), Q6_V_hi_W(v1_4), rt); + + HVX_VectorPair v0_10 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_13), Q6_V_lo_W(v1_5), rt); + HVX_VectorPair v0_11 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_13), Q6_V_hi_W(v1_5), rt); + + HVX_VectorPair v0_12 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_14), Q6_V_lo_W(v1_6), rt); + HVX_VectorPair v0_13 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_14), Q6_V_hi_W(v1_6), rt); + + HVX_VectorPair v0_14 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_15), Q6_V_lo_W(v1_7), rt); + HVX_VectorPair v0_15 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_15), Q6_V_hi_W(v1_7), rt); + + uint32_t *oN = (uint32_t*) ((uintptr_t) o + oN_stride); + switch (rem) { + case 31: + xnn_storeu_f32(oN, Q6_V_hi_W(v0_15)); + oN = (uint32_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 30: + xnn_storeu_f32(oN, Q6_V_lo_W(v0_15)); + oN = (uint32_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 29: + xnn_storeu_f32(oN, Q6_V_hi_W(v0_14)); + oN = (uint32_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 28: + xnn_storeu_f32(oN, Q6_V_lo_W(v0_14)); + oN = (uint32_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 27: + xnn_storeu_f32(oN, Q6_V_hi_W(v0_13)); + oN = (uint32_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 26: + xnn_storeu_f32(oN, Q6_V_lo_W(v0_13)); + oN = (uint32_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 25: + xnn_storeu_f32(oN, Q6_V_hi_W(v0_12)); + oN = (uint32_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 24: + xnn_storeu_f32(oN, Q6_V_lo_W(v0_12)); + oN = (uint32_t*) 
((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 23: + xnn_storeu_f32(oN, Q6_V_hi_W(v0_11)); + oN = (uint32_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 22: + xnn_storeu_f32(oN, Q6_V_lo_W(v0_11)); + oN = (uint32_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 21: + xnn_storeu_f32(oN, Q6_V_hi_W(v0_10)); + oN = (uint32_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 20: + xnn_storeu_f32(oN, Q6_V_lo_W(v0_10)); + oN = (uint32_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 19: + xnn_storeu_f32(oN, Q6_V_hi_W(v0_9)); + oN = (uint32_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 18: + xnn_storeu_f32(oN, Q6_V_lo_W(v0_9)); + oN = (uint32_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 17: + xnn_storeu_f32(oN, Q6_V_hi_W(v0_8)); + oN = (uint32_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 16: + xnn_storeu_f32(oN, Q6_V_lo_W(v0_8)); + oN = (uint32_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 15: + xnn_storeu_f32(oN, Q6_V_hi_W(v0_7)); + oN = (uint32_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 14: + xnn_storeu_f32(oN, Q6_V_lo_W(v0_7)); + oN = (uint32_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 13: + xnn_storeu_f32(oN, Q6_V_hi_W(v0_6)); + oN = (uint32_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 12: + xnn_storeu_f32(oN, Q6_V_lo_W(v0_6)); + oN = (uint32_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 11: + xnn_storeu_f32(oN, Q6_V_hi_W(v0_5)); + oN = (uint32_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 10: + xnn_storeu_f32(oN, Q6_V_lo_W(v0_5)); + oN = (uint32_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 9: + xnn_storeu_f32(oN, Q6_V_hi_W(v0_4)); + oN = (uint32_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 8: + xnn_storeu_f32(oN, Q6_V_lo_W(v0_4)); + oN = (uint32_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 7: + xnn_storeu_f32(oN, Q6_V_hi_W(v0_3)); + oN = (uint32_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 6: + xnn_storeu_f32(oN, Q6_V_lo_W(v0_3)); + oN = (uint32_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 5: + xnn_storeu_f32(oN, Q6_V_hi_W(v0_2)); + oN = (uint32_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 4: + xnn_storeu_f32(oN, Q6_V_lo_W(v0_2)); + oN = (uint32_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 3: + xnn_storeu_f32(oN, Q6_V_hi_W(v0_1)); + oN = (uint32_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 2: + xnn_storeu_f32(oN, Q6_V_lo_W(v0_1)); + oN = (uint32_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 1: + xnn_storeu_f32(oN, Q6_V_hi_W(v0_0)); + XNN_FALLTHROUGH + case 0: + xnn_storeu_f32(o, Q6_V_lo_W(v0_0)); + o = (uint32_t*) ((uintptr_t) o + tile_hbytes); + break; + default: + XNN_UNREACHABLE; + } + } + + + i0 = (const uint32_t*) ((uintptr_t) i0 + input_reset); + i1 = (const uint32_t*) ((uintptr_t) i0 + input_stride); + i2 = (const uint32_t*) ((uintptr_t) i1 + input_stride); + i3 = (const uint32_t*) ((uintptr_t) i2 + input_stride); + i4 = (const uint32_t*) ((uintptr_t) i3 + input_stride); + i5 = (const uint32_t*) ((uintptr_t) i4 + input_stride); + i6 = (const uint32_t*) ((uintptr_t) i5 + input_stride); + i7 = (const uint32_t*) ((uintptr_t) i6 + input_stride); + i8 = (const uint32_t*) 
((uintptr_t) i7 + input_stride); + i9 = (const uint32_t*) ((uintptr_t) i8 + input_stride); + i10 = (const uint32_t*) ((uintptr_t) i9 + input_stride); + i11 = (const uint32_t*) ((uintptr_t) i10 + input_stride); + i12 = (const uint32_t*) ((uintptr_t) i11 + input_stride); + i13 = (const uint32_t*) ((uintptr_t) i12 + input_stride); + i14 = (const uint32_t*) ((uintptr_t) i13 + input_stride); + i15 = (const uint32_t*) ((uintptr_t) i14 + input_stride); + i16 = (const uint32_t*) ((uintptr_t) i15 + input_stride); + i17 = (const uint32_t*) ((uintptr_t) i16 + input_stride); + i18 = (const uint32_t*) ((uintptr_t) i17 + input_stride); + i19 = (const uint32_t*) ((uintptr_t) i18 + input_stride); + i20 = (const uint32_t*) ((uintptr_t) i19 + input_stride); + i21 = (const uint32_t*) ((uintptr_t) i20 + input_stride); + i22 = (const uint32_t*) ((uintptr_t) i21 + input_stride); + i23 = (const uint32_t*) ((uintptr_t) i22 + input_stride); + i24 = (const uint32_t*) ((uintptr_t) i23 + input_stride); + i25 = (const uint32_t*) ((uintptr_t) i24 + input_stride); + i26 = (const uint32_t*) ((uintptr_t) i25 + input_stride); + i27 = (const uint32_t*) ((uintptr_t) i26 + input_stride); + i28 = (const uint32_t*) ((uintptr_t) i27 + input_stride); + i29 = (const uint32_t*) ((uintptr_t) i28 + input_stride); + i30 = (const uint32_t*) ((uintptr_t) i29 + input_stride); + i31 = (const uint32_t*) ((uintptr_t) i30 + input_stride); + o = (uint32_t*) ((uintptr_t) o + output_reset); + block_width = doz(block_width, tile_width); + } while (block_width != 0); +} + + diff --git a/src/x32-transposec/gen/x32-transposec-4x32-multi-multi-hvx.c b/src/x32-transposec/gen/x32-transposec-4x32-multi-multi-hvx.c new file mode 100644 index 00000000000..ccf82f1ddbe --- /dev/null +++ b/src/x32-transposec/gen/x32-transposec-4x32-multi-multi-hvx.c @@ -0,0 +1,91 @@ +// Auto-generated file. Do not edit! +// Template: src/x32-transposec/hvx.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
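+
+// The MULTI variant keeps one named output pointer per transposed row
+// (o0..o3 below). When block_width falls short of the tile width, the
+// unused pointers are redirected to o0 before the row loop: with
+// block_width == 3, the `block_width < 4` guard makes o3 alias o0, and
+// since o0 is stored last, the aliased store is a harmless overwrite
+// rather than an out-of-bounds write.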
+
+#include <assert.h>
+
+#include <hexagon_types.h>
+
+#include "xnnpack/common.h"
+#include "xnnpack/math.h"
+#include "xnnpack/transpose.h"
+
+void xnn_x32_transposec_ukernel__4x32_multi_multi_hvx(
+    const uint32_t* input,
+    uint32_t* output,
+    size_t input_stride,
+    size_t output_stride,
+    size_t block_width,
+    size_t block_height) XNN_OOB_READS
+{
+  assert(block_width == 1 || output_stride >= block_height * sizeof(uint32_t));
+  assert(block_height == 1 || input_stride >= block_width * sizeof(uint32_t));
+
+  const size_t tile_height = 4;
+  const size_t tile_width = 32;
+  const size_t tile_hbytes = tile_height * sizeof(uint32_t);
+  const size_t tile_wbytes = tile_width * sizeof(uint32_t);
+  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
+  const size_t input_offset = tile_height * input_stride;
+  const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint32_t);
+
+  const uint32_t* i0 = input;
+  const uint32_t* i1 = (const uint32_t*) ((uintptr_t) i0 + input_stride);
+  const uint32_t* i2 = (const uint32_t*) ((uintptr_t) i1 + input_stride);
+  const uint32_t* i3 = (const uint32_t*) ((uintptr_t) i2 + input_stride);
+  uint32_t* o0 = (uint32_t*) output;
+  uint32_t* o1 = (uint32_t*) ((uintptr_t) o0 + output_stride);
+  uint32_t* o2 = (uint32_t*) ((uintptr_t) o1 + output_stride);
+  uint32_t* o3 = (uint32_t*) ((uintptr_t) o2 + output_stride);
+  const size_t minus_output_stride = -output_stride;
+
+  do {
+    if XNN_UNPREDICTABLE(block_width < 2) {
+      o1 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width <= 2) {
+      o2 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width < 4) {
+      o3 = o0;
+    }
+    size_t bh = block_height;
+    for (; bh >= 4; bh -= 4) {
+      const HVX_Vector v2_0 = *((HVX_UVector *) i0); i0 = (uint32_t*) ((uintptr_t) i0 + input_offset);
+      const HVX_Vector v2_1 = *((HVX_UVector *) i1); i1 = (uint32_t*) ((uintptr_t) i1 + input_offset);
+      const HVX_Vector v2_2 = *((HVX_UVector *) i2); i2 = (uint32_t*) ((uintptr_t) i2 + input_offset);
+      const HVX_Vector v2_3 = *((HVX_UVector *) i3); i3 = (uint32_t*) ((uintptr_t) i3 + input_offset);
+
+      int rt = -4;
+      const HVX_VectorPair v1_0 = Q6_W_vshuff_VVR(v2_1, v2_0, rt);
+      const HVX_VectorPair v1_1 = Q6_W_vshuff_VVR(v2_3, v2_2, rt);
+
+      rt = rt << 1;
+      HVX_VectorPair v0_0 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_1), Q6_V_lo_W(v1_0), rt);
+      HVX_VectorPair v0_1 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_1), Q6_V_hi_W(v1_0), rt);
+
+      xnn_storeu_f32(o3, Q6_V_hi_W(v0_1)); o3 = (uint32_t*) ((uintptr_t) o3 + tile_hbytes);
+      xnn_storeu_f32(o2, Q6_V_lo_W(v0_1)); o2 = (uint32_t*) ((uintptr_t) o2 + tile_hbytes);
+      xnn_storeu_f32(o1, Q6_V_hi_W(v0_0)); o1 = (uint32_t*) ((uintptr_t) o1 + tile_hbytes);
+      xnn_storeu_f32(o0, Q6_V_lo_W(v0_0)); o0 = (uint32_t*) ((uintptr_t) o0 + tile_hbytes);
+    }
+
+
+    i0 = (const uint32_t*) ((uintptr_t) i0 + input_reset);
+    i1 = (const uint32_t*) ((uintptr_t) i0 + input_stride);
+    i2 = (const uint32_t*) ((uintptr_t) i1 + input_stride);
+    i3 = (const uint32_t*) ((uintptr_t) i2 + input_stride);
+    o0 = (uint32_t*) ((uintptr_t) o0 + output_reset);
+    o1 = (uint32_t*) ((uintptr_t) o1 + output_reset);
+    o2 = (uint32_t*) ((uintptr_t) o2 + output_reset);
+    o3 = (uint32_t*) ((uintptr_t) o3 + output_reset);
+    block_width = doz(block_width, tile_width);
+  } while (block_width != 0);
+}
+
+
diff --git a/src/x32-transposec/gen/x32-transposec-8x32-multi-multi-hvx.c b/src/x32-transposec/gen/x32-transposec-8x32-multi-multi-hvx.c
new file mode 100644
index 00000000000..bd69bf46937
--- /dev/null
+++ b/src/x32-transposec/gen/x32-transposec-8x32-multi-multi-hvx.c
@@ -0,0 +1,139 @@
+// Auto-generated file. Do not edit!
+//   Template: src/x32-transposec/hvx.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2024 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <hexagon_types.h>
+
+#include "xnnpack/common.h"
+#include "xnnpack/math.h"
+#include "xnnpack/transpose.h"
+
+void xnn_x32_transposec_ukernel__8x32_multi_multi_hvx(
+    const uint32_t* input,
+    uint32_t* output,
+    size_t input_stride,
+    size_t output_stride,
+    size_t block_width,
+    size_t block_height) XNN_OOB_READS
+{
+  assert(block_width == 1 || output_stride >= block_height * sizeof(uint32_t));
+  assert(block_height == 1 || input_stride >= block_width * sizeof(uint32_t));
+
+  const size_t tile_height = 8;
+  const size_t tile_width = 32;
+  const size_t tile_hbytes = tile_height * sizeof(uint32_t);
+  const size_t tile_wbytes = tile_width * sizeof(uint32_t);
+  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
+  const size_t input_offset = tile_height * input_stride;
+  const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint32_t);
+
+  const uint32_t* i0 = input;
+  const uint32_t* i1 = (const uint32_t*) ((uintptr_t) i0 + input_stride);
+  const uint32_t* i2 = (const uint32_t*) ((uintptr_t) i1 + input_stride);
+  const uint32_t* i3 = (const uint32_t*) ((uintptr_t) i2 + input_stride);
+  const uint32_t* i4 = (const uint32_t*) ((uintptr_t) i3 + input_stride);
+  const uint32_t* i5 = (const uint32_t*) ((uintptr_t) i4 + input_stride);
+  const uint32_t* i6 = (const uint32_t*) ((uintptr_t) i5 + input_stride);
+  const uint32_t* i7 = (const uint32_t*) ((uintptr_t) i6 + input_stride);
+  uint32_t* o0 = (uint32_t*) output;
+  uint32_t* o1 = (uint32_t*) ((uintptr_t) o0 + output_stride);
+  uint32_t* o2 = (uint32_t*) ((uintptr_t) o1 + output_stride);
+  uint32_t* o3 = (uint32_t*) ((uintptr_t) o2 + output_stride);
+  uint32_t* o4 = (uint32_t*) ((uintptr_t) o3 + output_stride);
+  uint32_t* o5 = (uint32_t*) ((uintptr_t) o4 + output_stride);
+  uint32_t* o6 = (uint32_t*) ((uintptr_t) o5 + output_stride);
+  uint32_t* o7 = (uint32_t*) ((uintptr_t) o6 + output_stride);
+  const size_t minus_output_stride = -output_stride;
+
+  do {
+    if XNN_UNPREDICTABLE(block_width < 2) {
+      o1 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width <= 2) {
+      o2 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width < 4) {
+      o3 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width <= 4) {
+      o4 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width < 6) {
+      o5 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width <= 6) {
+      o6 = o0;
+    }
+    if XNN_UNPREDICTABLE(block_width < 8) {
+      o7 = o0;
+    }
+    size_t bh = block_height;
+    for (; bh >= 8; bh -= 8) {
+      const HVX_Vector v3_0 = *((HVX_UVector *) i0); i0 = (uint32_t*) ((uintptr_t) i0 + input_offset);
+      const HVX_Vector v3_1 = *((HVX_UVector *) i1); i1 = (uint32_t*) ((uintptr_t) i1 + input_offset);
+      const HVX_Vector v3_2 = *((HVX_UVector *) i2); i2 = (uint32_t*) ((uintptr_t) i2 + input_offset);
+      const HVX_Vector v3_3 = *((HVX_UVector *) i3); i3 = (uint32_t*) ((uintptr_t) i3 + input_offset);
+      const HVX_Vector v3_4 = *((HVX_UVector *) i4); i4 = (uint32_t*) ((uintptr_t) i4 + input_offset);
+      const HVX_Vector v3_5 = *((HVX_UVector *) i5); i5 = (uint32_t*) ((uintptr_t) i5 + input_offset);
+      const HVX_Vector v3_6 = *((HVX_UVector *) i6); i6 = (uint32_t*) ((uintptr_t) i6 + input_offset);
+      const
HVX_Vector v3_7 = *((HVX_UVector *) i7); i7 = (uint32_t*) ((uintptr_t) i7 + input_offset); + + int rt = -4; + const HVX_VectorPair v2_0 = Q6_W_vshuff_VVR(v3_1, v3_0, rt); + const HVX_VectorPair v2_1 = Q6_W_vshuff_VVR(v3_3, v3_2, rt); + const HVX_VectorPair v2_2 = Q6_W_vshuff_VVR(v3_5, v3_4, rt); + const HVX_VectorPair v2_3 = Q6_W_vshuff_VVR(v3_7, v3_6, rt); + + rt = rt << 1; + HVX_VectorPair v1_0 = Q6_W_vshuff_VVR(Q6_V_lo_W(v2_1), Q6_V_lo_W(v2_0), rt); + HVX_VectorPair v1_1 = Q6_W_vshuff_VVR(Q6_V_hi_W(v2_1), Q6_V_hi_W(v2_0), rt); + + HVX_VectorPair v1_2 = Q6_W_vshuff_VVR(Q6_V_lo_W(v2_3), Q6_V_lo_W(v2_2), rt); + HVX_VectorPair v1_3 = Q6_W_vshuff_VVR(Q6_V_hi_W(v2_3), Q6_V_hi_W(v2_2), rt); + + rt = rt << 1; + HVX_VectorPair v0_0 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_2), Q6_V_lo_W(v1_0), rt); + HVX_VectorPair v0_1 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_2), Q6_V_hi_W(v1_0), rt); + + HVX_VectorPair v0_2 = Q6_W_vshuff_VVR(Q6_V_lo_W(v1_3), Q6_V_lo_W(v1_1), rt); + HVX_VectorPair v0_3 = Q6_W_vshuff_VVR(Q6_V_hi_W(v1_3), Q6_V_hi_W(v1_1), rt); + + xnn_storeu_f32(o7, Q6_V_hi_W(v0_3)); o7 = (uint32_t*) ((uintptr_t) o7 + tile_hbytes); + xnn_storeu_f32(o6, Q6_V_lo_W(v0_3)); o6 = (uint32_t*) ((uintptr_t) o6 + tile_hbytes); + xnn_storeu_f32(o5, Q6_V_hi_W(v0_2)); o5 = (uint32_t*) ((uintptr_t) o5 + tile_hbytes); + xnn_storeu_f32(o4, Q6_V_lo_W(v0_2)); o4 = (uint32_t*) ((uintptr_t) o4 + tile_hbytes); + xnn_storeu_f32(o3, Q6_V_hi_W(v0_1)); o3 = (uint32_t*) ((uintptr_t) o3 + tile_hbytes); + xnn_storeu_f32(o2, Q6_V_lo_W(v0_1)); o2 = (uint32_t*) ((uintptr_t) o2 + tile_hbytes); + xnn_storeu_f32(o1, Q6_V_hi_W(v0_0)); o1 = (uint32_t*) ((uintptr_t) o1 + tile_hbytes); + xnn_storeu_f32(o0, Q6_V_lo_W(v0_0)); o0 = (uint32_t*) ((uintptr_t) o0 + tile_hbytes); + } + + + i0 = (const uint32_t*) ((uintptr_t) i0 + input_reset); + i1 = (const uint32_t*) ((uintptr_t) i0 + input_stride); + i2 = (const uint32_t*) ((uintptr_t) i1 + input_stride); + i3 = (const uint32_t*) ((uintptr_t) i2 + input_stride); + i4 = (const uint32_t*) ((uintptr_t) i3 + input_stride); + i5 = (const uint32_t*) ((uintptr_t) i4 + input_stride); + i6 = (const uint32_t*) ((uintptr_t) i5 + input_stride); + i7 = (const uint32_t*) ((uintptr_t) i6 + input_stride); + o0 = (uint32_t*) ((uintptr_t) o0 + output_reset); + o1 = (uint32_t*) ((uintptr_t) o1 + output_reset); + o2 = (uint32_t*) ((uintptr_t) o2 + output_reset); + o3 = (uint32_t*) ((uintptr_t) o3 + output_reset); + o4 = (uint32_t*) ((uintptr_t) o4 + output_reset); + o5 = (uint32_t*) ((uintptr_t) o5 + output_reset); + o6 = (uint32_t*) ((uintptr_t) o6 + output_reset); + o7 = (uint32_t*) ((uintptr_t) o7 + output_reset); + block_width = doz(block_width, tile_width); + } while (block_width != 0); +} + + diff --git a/src/x32-transposec/hvx.c.in b/src/x32-transposec/hvx.c.in new file mode 100644 index 00000000000..98972e18f9c --- /dev/null +++ b/src/x32-transposec/hvx.c.in @@ -0,0 +1,182 @@ +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
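+
+// The tile is transposed in registers by a butterfly of NUM_ITERS =
+// log2(TILE_HEIGHT) rounds of Q6_W_vshuff_VVR. The negative rt operand
+// (starting at -(SIZE/8) bytes and doubled each round by rt << 1) in
+// effect selects the interleave granularity, so lanes migrate to their
+// transposed positions one power of two at a time. For TILE_HEIGHT = 4
+// this is just two rounds, e.g.:
+//   v1_x = Q6_W_vshuff_VVR(v2_{2x+1}, v2_{2x}, -4);  // pair 4-byte lanes
+//   v0_x = Q6_W_vshuff_VVR(hi/lo halves of v1, -8);  // then 8-byte groups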
+$import math
+$assert IN_PTRS in ["MULTI", "REUSE"]
+$assert OUT_PTRS in ["MULTI", "SWITCH", "MOV", "DEC"]
+$assert SIZE in [8, 16, 32]
+$assert TILE_HEIGHT in [2, 4, 8, 16, 32]
+$TILE_WIDTH = int(1024/SIZE)
+$NUM_ITERS = int(math.log2(TILE_HEIGHT))
+$SUFFIX = ''
+$TILE_SIZE = TILE_HEIGHT
+$VECTOR_SIZE = int(128)
+$NUM_D_REGISTERS=int(VECTOR_SIZE/64)
+
+#include <assert.h>
+
+#include <hexagon_types.h>
+
+#include "xnnpack/common.h"
+#include "xnnpack/math.h"
+#include "xnnpack/transpose.h"
+
+void xnn_x${SIZE}_transposec_ukernel__${TILE_HEIGHT}x${TILE_WIDTH}_${IN_PTRS.lower()}_${OUT_PTRS.lower()}_hvx(
+    const uint${SIZE}_t* input,
+    uint${SIZE}_t* output,
+    size_t input_stride,
+    size_t output_stride,
+    size_t block_width,
+    size_t block_height) XNN_OOB_READS
+{
+  assert(block_width == 1 || output_stride >= block_height * sizeof(uint${SIZE}_t));
+  assert(block_height == 1 || input_stride >= block_width * sizeof(uint${SIZE}_t));
+
+  const size_t tile_height = ${TILE_HEIGHT};
+  const size_t tile_width = ${TILE_WIDTH};
+  const size_t tile_hbytes = tile_height * sizeof(uint${SIZE}_t);
+  const size_t tile_wbytes = tile_width * sizeof(uint${SIZE}_t);
+  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
+  $if IN_PTRS == "MULTI":
+    const size_t input_offset = tile_height * input_stride;
+  $if OUT_PTRS in ["MOV", "DEC"]:
+    const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint${SIZE}_t) - tile_hbytes;
+  $else:
+    const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint${SIZE}_t);
+
+  $if IN_PTRS == "MULTI":
+    const uint${SIZE}_t* i0 = input;
+    $for N in range(1, TILE_HEIGHT):
+      const uint${SIZE}_t* i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride);
+  $else:
+    const uint${SIZE}_t* i0 = input;
+  $if OUT_PTRS == "MULTI":
+    uint${SIZE}_t* o0 = (uint${SIZE}_t*) output;
+    $for N in range(1, TILE_HEIGHT):
+      uint${SIZE}_t* o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N-1} + output_stride);
+  $elif OUT_PTRS == "SWITCH":
+    uint${SIZE}_t* o = (uint${SIZE}_t*) output;
+  $else:
+    uint${SIZE}_t* o = (uint${SIZE}_t*) ((uintptr_t) output - tile_hbytes);
+  const size_t minus_output_stride = -output_stride;
+
+  do {
+    $if OUT_PTRS == "MULTI":
+      if XNN_UNPREDICTABLE(block_width < 2) {
+        o1 = o0;
+      }
+      $for N in range(2, TILE_HEIGHT, 2):
+        if XNN_UNPREDICTABLE(block_width <= ${N}) {
+          o${N} = o0;
+        }
+        if XNN_UNPREDICTABLE(block_width < ${N+2}) {
+          o${N+1} = o0;
+        }
+    $elif OUT_PTRS in ["MOV", "DEC"]:
+      const size_t rem = min(block_width - 1, ${TILE_HEIGHT - 1});
+      const size_t oN_stride = rem * output_stride;
+      const size_t oN_offset = oN_stride + tile_hbytes;
+    $else:
+      const size_t rem = min(block_width - 1, ${TILE_HEIGHT - 1});
+      const size_t oN_stride = rem * output_stride;
+    size_t bh = block_height;
+    for (; bh >= ${TILE_HEIGHT}; bh -= ${TILE_HEIGHT}) {
+      $for N in range(TILE_HEIGHT):
+        $if IN_PTRS == "REUSE":
+          const HVX_Vector v${NUM_ITERS}_${N} = *((HVX_UVector *) i0); i0 = (uint${SIZE}_t*) ((uintptr_t) i0 + input_stride);
+        $else:
+          const HVX_Vector v${NUM_ITERS}_${N} = *((HVX_UVector *) i${N}); i${N} = (uint${SIZE}_t*) ((uintptr_t) i${N} + input_offset);
+
+      int rt = -${int(SIZE/8)};
+      $for N in range(TILE_HEIGHT >> 1):
+        const HVX_VectorPair v${NUM_ITERS-1}_${N} = Q6_W_vshuff_VVR(v${NUM_ITERS}_${2*N+1}, v${NUM_ITERS}_${2*N}, rt);
+
+      $for M in range(1, NUM_ITERS):
+        rt = rt << 1;
+        $NUM_ITERS_O = int(math.pow(2, M - 1))
+        $for N in range(0, TILE_HEIGHT >> 1, NUM_ITERS_O*2):
+
$for O in range(NUM_ITERS_O): + $PREV_PAIR = NUM_ITERS-M + HVX_VectorPair v${PREV_PAIR-1}_${N+O*2} = Q6_W_vshuff_VVR(Q6_V_lo_W(v${PREV_PAIR}_${N+O+NUM_ITERS_O}), Q6_V_lo_W(v${PREV_PAIR}_${N+O}), rt); + HVX_VectorPair v${PREV_PAIR-1}_${N+O*2+1} = Q6_W_vshuff_VVR(Q6_V_hi_W(v${PREV_PAIR}_${N+O+NUM_ITERS_O}), Q6_V_hi_W(v${PREV_PAIR}_${N+O}), rt); + + $if OUT_PTRS == "SWITCH": + uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride); + switch (rem) { + $for N in reversed(range(2, TILE_HEIGHT, 2)): + case ${N+1}: + xnn_storeu_f32(oN, Q6_V_hi_W(v0_${int(N/2)})); + oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case ${N}: + xnn_storeu_f32(oN, Q6_V_lo_W(v0_${int(N/2)})); + oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride); + XNN_FALLTHROUGH + case 1: + xnn_storeu_f32(oN, Q6_V_hi_W(v0_0)); + XNN_FALLTHROUGH + case 0: + xnn_storeu_f32(o, Q6_V_lo_W(v0_0)); + o = (uint${SIZE}_t*) ((uintptr_t) o + tile_hbytes); + break; + default: + XNN_UNREACHABLE; + } + $elif OUT_PTRS in ["MOV", "DEC"]: + o = (uint${SIZE}_t*) ((uintptr_t) o + oN_offset); + xnn_storeu_f32(o, Q6_V_hi_W(v0_${(TILE_HEIGHT-1)>>1})); + $if OUT_PTRS == "MOV": + uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); + $for N in reversed(range(2, TILE_SIZE, 2)): + if XNN_UNPREDICTABLE(block_width > ${N+1}) { + $if OUT_PTRS == "MOV": + o = oN; + $else: + o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); + } + xnn_storeu_f32(o, Q6_V_lo_W(v0_${N>>1})); + $if OUT_PTRS == "MOV": + oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); + if XNN_UNPREDICTABLE(block_width >= ${N}) { + $if OUT_PTRS == "MOV": + o = oN; + $else: + o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); + } + xnn_storeu_f32(o, Q6_V_hi_W(v0_${(N-1)>>1})); + $if OUT_PTRS == "MOV": + oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); + if XNN_UNPREDICTABLE(block_width > 1) { + $if OUT_PTRS == "MOV": + o = oN; + $else: + o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride); + } + xnn_storeu_f32(o, Q6_V_lo_W(v0_0)); + $else: + $for N in reversed(range(0, TILE_HEIGHT, 2)): + xnn_storeu_f32(o${N+1}, Q6_V_hi_W(v0_${int(N/2)})); o${N+1} = (uint${SIZE}_t*) ((uintptr_t) o${N+1} + tile_hbytes); + xnn_storeu_f32(o${N}, Q6_V_lo_W(v0_${int(N/2)})); o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N} + tile_hbytes); + } + $if OUT_PTRS in ["MOV", "DEC"]: + o = (uint${SIZE}_t*) ((uintptr_t) o + tile_hbytes); + + + $if IN_PTRS == "MULTI": + i0 = (const uint${SIZE}_t*) ((uintptr_t) i0 + input_reset); + $for N in range(1, TILE_HEIGHT): + i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride); + $else: + i0 = (const uint${SIZE}_t*) ((uintptr_t) i0 + input_reset); + $if OUT_PTRS == "MULTI": + o0 = (uint${SIZE}_t*) ((uintptr_t) o0 + output_reset); + $for N in range(1, TILE_HEIGHT): + o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N} + output_reset); + $else: + o = (uint${SIZE}_t*) ((uintptr_t) o + output_reset); + block_width = doz(block_width, tile_width); + } while (block_width != 0); +} + + diff --git a/src/x32-transposec/x32-transposec.h b/src/x32-transposec/x32-transposec.h index 2538b9726cd..c94a8d90e93 100644 --- a/src/x32-transposec/x32-transposec.h +++ b/src/x32-transposec/x32-transposec.h @@ -74,3 +74,6 @@ XNN_TRANSPOSE_UKERNEL(xnn_arch_arm_neon, xnn_x32_transposec_ukernel__4x4_reuse_s XNN_TRANSPOSE_UKERNEL(xnn_arch_arm_neon, xnn_x32_transposec_ukernel__4x4_aarch64_neon_tbl128, 32, uint32_t, 4, 4) #endif // XNN_ARCH_ARM64 +#if XNN_ARCH_HEXAGON && XNN_ENABLE_HVX 
+XNN_TRANSPOSE_UKERNEL(0, xnn_x32_transposec_ukernel__32x32_multi_multi_hvx, 32, uint32_t, 32, 32)
+#endif  // XNN_ARCH_HEXAGON && XNN_ENABLE_HVX
\ No newline at end of file