/*
 * memset_asimd_32 -- from patch "WIP memset neon 32" targeting
 * veejay-current/veejay-core/libvjmem/memcpy.c
 *
 * NOTE(review): the same patch registers the benchmark table entry
 * "Advanced SIMD memset() with line size of 32" with a pointer to
 * memset_asimd_64; that entry should reference memset_asimd_32, otherwise
 * this routine is never selected or measured.
 */

/**
 * Fill a buffer with a single byte value using Advanced SIMD stores,
 * writing 32 bytes (two 16-byte vector stores) per main-loop iteration.
 *
 * @param dst   start of the region to fill; must be writable for `size` bytes
 * @param value byte written to every position of the region
 * @param size  number of bytes to fill; 0 writes nothing
 */
void memset_asimd_32(uint8_t *dst, uint8_t value, size_t size) {
    /* `value` replicated into all 16 lanes; reused by every vector store. */
    const uint8x16_t fill_q = vdupq_n_u8(value);

    size_t num_blocks = size / 32;
    size_t remaining_bytes = size % 32;

    for (size_t i = 0; i < num_blocks; i++) {
        vst1q_u8(dst, fill_q);
        vst1q_u8(dst + 16, fill_q);
        dst += 32;
    }

    /* Fewer than 32 bytes are left, so at most one 16-byte store fits. */
    if (remaining_bytes >= 16) {
        vst1q_u8(dst, fill_q);
        dst += 16;
        remaining_bytes -= 16;
    }

    /*
     * Fewer than 16 bytes are left, so at most one 8-byte store fits.
     * Store the low half of the splatted vector: reading `value` through a
     * uint64_t pointer would access 7 bytes beyond the one-byte parameter
     * and write indeterminate data.
     */
    if (remaining_bytes >= 8) {
        vst1_u8(dst, vget_low_u8(fill_q));
        dst += 8;
        remaining_bytes -= 8;
    }

    /* Byte-wise tail for the final 0..7 bytes. */
    while (remaining_bytes > 0) {
        *dst++ = value;
        remaining_bytes--;
    }
}