Skip to content

Commit

Permalink
Add temporary multiload/store intrinsics
Browse files Browse the repository at this point in the history
  • Loading branch information
stellar-aria committed Dec 11, 2024
1 parent 2e6c77a commit f5bd078
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 6 deletions.
2 changes: 1 addition & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
cmake_path(SET SHARED_INCLUDE ${CMAKE_CURRENT_LIST_DIR})

target_sources(deluge PUBLIC main.c resetprg.c c_lib_alternatives.S sys_stubs.c)
target_sources(deluge PUBLIC main.c resetprg.c c_lib_alternatives.S sys_stubs.c memmove.c)

add_subdirectory(OSLikeStuff)
add_subdirectory(deluge)
Expand Down
2 changes: 1 addition & 1 deletion src/mem_functions.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ extern "C" {
// shenanigans
void* memset(void*, int, size_t);
void* memcpy(void* dest, const void* src, size_t n);
void* memmove(void* dest, const void* src, size_t n);
int strcmp(const char* str1, const char* str2);
void* memmove(void* dst, const void* src, size_t len);
#ifdef __cplusplus
}
#endif
34 changes: 30 additions & 4 deletions src/deluge/dsp/memmove.c → src/memmove.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,35 @@
#include <stdint.h>
#include <stdlib.h>

void* my_memmove(void* dst, const void* src, size_t len) {
[[gnu::always_inline]] inline uint8x16x2_t vld1q_u8_x2(unsigned char const* ptr) {
uint8x16x2_t output;
output.val[0] = vld1q_u8(ptr);
output.val[1] = vld1q_u8(ptr + 16);
return output;
}

[[gnu::always_inline]] inline uint8x16x4_t vld1q_u8_x4(unsigned char const* ptr) {
uint8x16x4_t output;
output.val[0] = vld1q_u8(ptr);
output.val[1] = vld1q_u8(ptr + 16);
output.val[0] = vld1q_u8(ptr + 32);
output.val[0] = vld1q_u8(ptr + 48);
return output;
}

[[gnu::always_inline]] inline void vst1q_u8_x2(unsigned char* ptr, uint8x16x2_t output) {
vst1q_u8(ptr, output.val[0]);
vst1q_u8(ptr + 16, output.val[1]);
}

[[gnu::always_inline]] inline void vst1q_u8_x4(unsigned char* ptr, uint8x16x4_t output) {
vst1q_u8(ptr, output.val[0]);
vst1q_u8(ptr + 16, output.val[1]);
vst1q_u8(ptr + 32, output.val[2]);
vst1q_u8(ptr + 48, output.val[3]);
}

void* memmove(void* dst, const void* src, size_t len) {
ptrdiff_t result;
asm("sub %0, %1, %2" : "=r"(result) : "r"(dst), "r"(src));
if (abs(result) >= (ptrdiff_t)len) {
Expand Down Expand Up @@ -51,21 +79,19 @@ void* my_memmove(void* dst, const void* src, size_t len) {
vst1q_u8(d, vld1q_u8(s));
}

// quadword x2
if (len % 64) {
s -= 32;
d -= 32;
vst1q_u8_x2(d, vld1q_u8_x2(s));
}

// quadword x4
if (len % 128) {
s -= 64;
d -= 64;
vst1q_u8_x4(d, vld1q_u8_x4(s));
}

// quadword x8
// max number of quadwords is 16
while ((intptr_t)d > (intptr_t)dst) {
s -= 64;
uint8x16x4_t ld1 = vld1q_u8_x4(s);
Expand Down

0 comments on commit f5bd078

Please sign in to comment.