ARMv8-a RPI4 memset, cacheline size WIP

This commit is contained in:
veejay
2023-09-18 01:29:36 +02:00
parent c796c99f00
commit 057ce7a549
3 changed files with 22 additions and 11 deletions

View File

@@ -1756,28 +1756,38 @@ static void fast_memset(void * to, int val, size_t len)
#ifdef HAVE_ARM_ASIMD #ifdef HAVE_ARM_ASIMD
void memset_asimd_v3(void *dst, uint8_t val, size_t len) { void memset_asimd_v3(void *dst, uint8_t val, size_t len) {
uint8x16_t value = vdupq_n_u8(val); uint8x16_t value = vdupq_n_u8(val);
uint8_t *dst_bytes = (uint8_t *)dst; uint8_t *dst_bytes = (uint8_t *)dst;
size_t num_blocks = len / 16;
const int CACHE_LINE_SIZE = cpu_cache_size();
size_t offset = (CACHE_LINE_SIZE - ((uintptr_t)dst_bytes % CACHE_LINE_SIZE)) % CACHE_LINE_SIZE;
for (size_t i = 0; i < offset; i++) {
*dst_bytes++ = val;
len--;
}
value = vld1q_u8(&val); value = vld1q_u8(&val);
for (size_t i = 0; i < num_blocks; i++) { size_t num_lines = len / CACHE_LINE_SIZE;
uint8x16_t dst_data = vld1q_u8(dst_bytes);
dst_data = vorrq_u8(dst_data, value); for (size_t i = 0; i < num_lines; i++) {
for (size_t j = 0; j < CACHE_LINE_SIZE / 16; j++) {
vst1q_u8(dst_bytes, dst_data); uint8x16_t dst_data = vld1q_u8(dst_bytes);
dst_bytes += 16; dst_data = vorrq_u8(dst_data, value);
vst1q_u8(dst_bytes, dst_data);
dst_bytes += 16;
}
} }
size_t remaining_bytes = len % 16; size_t remaining_bytes = len % CACHE_LINE_SIZE;
for (size_t i = 0; i < remaining_bytes; i++) { for (size_t i = 0; i < remaining_bytes; i++) {
*dst_bytes++ = val; *dst_bytes++ = val;
} }
} }
void memset_asimd(void *dst, uint8_t val, size_t len) { void memset_asimd(void *dst, uint8_t val, size_t len) {
uint8x16_t value = vdupq_n_u8(val); uint8x16_t value = vdupq_n_u8(val);
uint8_t *dst_bytes = (uint8_t *)dst; uint8_t *dst_bytes = (uint8_t *)dst;

View File

@@ -33,7 +33,7 @@ extern void yuyv_plane_init();
extern void benchmark_tasks(int n_tasks, long n_frames, int w, int h); extern void benchmark_tasks(int n_tasks, long n_frames, int w, int h);
extern void init_parallel_tasks(int n_tasks); extern void init_parallel_tasks(int n_tasks);
static int MEM_ALIGNMENT_SIZE = 0; static int MEM_ALIGNMENT_SIZE = 0;
/* Cache-line size in bytes; defaults to 64. Defined here with external
 * linkage (no `extern` keyword on an initialized definition). */
int CACHE_LINE_SIZE = 64;
static int has_cpuid(void) static int has_cpuid(void)

View File

@@ -65,4 +65,5 @@ extern void *vj_simple_pool_alloc( void *ptr, size_t s );
extern void *vj_simple_pool_init( size_t s ); extern void *vj_simple_pool_init( size_t s );
extern void vj_simple_pool_reset( void *ptr ); extern void vj_simple_pool_reset( void *ptr );
extern void vj_mem_destroy(); extern void vj_mem_destroy();
#endif #endif