From 057ce7a549c4ace67f5bcf94538e372005f23bde Mon Sep 17 00:00:00 2001 From: veejay <> Date: Mon, 18 Sep 2023 01:29:36 +0200 Subject: [PATCH] ARMv8-a RPI4 memset, cacheline size WIP --- veejay-current/veejay-core/libvjmem/memcpy.c | 30 +++++++++++++------- veejay-current/veejay-core/libvjmem/vj-x86.c | 2 +- veejay-current/veejay-core/libvjmem/vjmem.h | 1 + 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/veejay-current/veejay-core/libvjmem/memcpy.c b/veejay-current/veejay-core/libvjmem/memcpy.c index 37e40111..1ff76a60 100644 --- a/veejay-current/veejay-core/libvjmem/memcpy.c +++ b/veejay-current/veejay-core/libvjmem/memcpy.c @@ -1756,28 +1756,38 @@ static void fast_memset(void * to, int val, size_t len) #ifdef HAVE_ARM_ASIMD void memset_asimd_v3(void *dst, uint8_t val, size_t len) { - uint8x16_t value = vdupq_n_u8(val); + uint8x16_t value = vdupq_n_u8(val); uint8_t *dst_bytes = (uint8_t *)dst; - size_t num_blocks = len / 16; + + const int CACHE_LINE_SIZE = cpu_cache_size(); + size_t offset = (CACHE_LINE_SIZE - ((uintptr_t)dst_bytes % CACHE_LINE_SIZE)) % CACHE_LINE_SIZE; + + for (size_t i = 0; i < offset; i++) { + *dst_bytes++ = val; + len--; + } value = vld1q_u8(&val); - for (size_t i = 0; i < num_blocks; i++) { - uint8x16_t dst_data = vld1q_u8(dst_bytes); + size_t num_lines = len / CACHE_LINE_SIZE; - dst_data = vorrq_u8(dst_data, value); - - vst1q_u8(dst_bytes, dst_data); - dst_bytes += 16; + for (size_t i = 0; i < num_lines; i++) { + for (size_t j = 0; j < CACHE_LINE_SIZE / 16; j++) { + uint8x16_t dst_data = vld1q_u8(dst_bytes); + dst_data = vorrq_u8(dst_data, value); + vst1q_u8(dst_bytes, dst_data); + dst_bytes += 16; + } } - size_t remaining_bytes = len % 16; - + size_t remaining_bytes = len % CACHE_LINE_SIZE; for (size_t i = 0; i < remaining_bytes; i++) { *dst_bytes++ = val; } } + + void memset_asimd(void *dst, uint8_t val, size_t len) { uint8x16_t value = vdupq_n_u8(val); uint8_t *dst_bytes = (uint8_t *)dst; diff --git a/veejay-current/veejay-core/libvjmem/vj-x86.c b/veejay-current/veejay-core/libvjmem/vj-x86.c index 8fe7afb8..8935a33d 100644 --- a/veejay-current/veejay-core/libvjmem/vj-x86.c +++ b/veejay-current/veejay-core/libvjmem/vj-x86.c @@ -33,7 +33,7 @@ extern void yuyv_plane_init(); extern void benchmark_tasks(int n_tasks, long n_frames, int w, int h); extern void init_parallel_tasks(int n_tasks); static int MEM_ALIGNMENT_SIZE = 0; -static int CACHE_LINE_SIZE = 64; +extern int CACHE_LINE_SIZE = 64; static int has_cpuid(void) diff --git a/veejay-current/veejay-core/libvjmem/vjmem.h b/veejay-current/veejay-core/libvjmem/vjmem.h index 24953d08..1f19ed10 100644 --- a/veejay-current/veejay-core/libvjmem/vjmem.h +++ b/veejay-current/veejay-core/libvjmem/vjmem.h @@ -65,4 +65,5 @@ extern void *vj_simple_pool_alloc( void *ptr, size_t s ); extern void *vj_simple_pool_init( size_t s ); extern void vj_simple_pool_reset( void *ptr ); extern void vj_mem_destroy(); + #endif