diff --git a/veejay-current/veejay-client/src/gveejay.c b/veejay-current/veejay-client/src/gveejay.c
index 89ea2741..15522572 100644
--- a/veejay-current/veejay-client/src/gveejay.c
+++ b/veejay-current/veejay-client/src/gveejay.c
@@ -311,7 +311,7 @@ int main(int argc, char **argv)
     }
     g_option_context_free(context);
 
-    vj_mem_init();
+    vj_mem_init(0,0);
     vevo_strict_init();
 
     status = g_application_run (G_APPLICATION (app), argc, argv);
diff --git a/veejay-current/veejay-core/configure.ac b/veejay-current/veejay-core/configure.ac
index 6bd12fc0..bfe0bed1 100644
--- a/veejay-current/veejay-core/configure.ac
+++ b/veejay-current/veejay-core/configure.ac
@@ -222,6 +222,10 @@ have_asm_mmx=false
 have_asm_sse=false
 have_asm_sse2=false
 have_asm_mmx2=false
+have_asm_avx=false
+have_asm_avx2=false
+have_asm_avx512=false
+have_asm_erms=false
 have_asm_3dnow=false
 have_cmov=false
 have_x86cpu=false
@@ -233,6 +237,7 @@ have_ps2=false
 have_arm=false
 have_armv7a=false
 
+
 OP_CFLAGS=""
 
 LZO_EXTRA_CFLAGS="-DMINILZO_HAVE_CONFIG_H"
@@ -396,24 +401,24 @@ then
     ac_cv_flag_mmx=yes
     ac_cv_flag_sse=yes
     ac_cv_flag_sse4_2=yes
-    ac_cv_flag_sse4_1=yes
-    ac_cv_flag_sse2=yes
+    ac_cv_flag_sse4_1=yes
+    ac_cv_flag_sse2=yes
     ac_cv_flag_cmov=yes
 
     AC_DEFINE(HAVE_ASM_MMX,1,[Compiling in MMX support])
-    AC_DEFINE(HAVE_MMX,1,[Compiling in MMX support])
+    AC_DEFINE(HAVE_MMX,1,[Compiling in MMX support])
     have_asm_mmx=true
 
-    AC_DEFINE(HAVE_ASM_SSE,1,[Compiling in SSE support])
-    AC_DEFINE(HAVE_SSE,1,[Compiling in SSE support])
-    have_asm_sse=true
+    AC_DEFINE(HAVE_ASM_SSE,1,[Compiling in SSE support])
+    AC_DEFINE(HAVE_SSE,1,[Compiling in SSE support])
+    have_asm_sse=true
 
     AC_DEFINE(HAVE_ASM_SSE2,1,[Compiling in SSE2 support])
-    AC_DEFINE(HAVE_SSE2,1,[Compiling in SSE2 support])
-    have_asm_sse2=true
+    AC_DEFINE(HAVE_SSE2,1,[Compiling in SSE2 support])
+    have_asm_sse2=true
 
     AC_DEFINE(HAVE_CMOV,1,[Compiling in CMOV])
-    have_cmov=true
+    have_cmov=true
 fi
 
 if test x$have_linux = xtrue
 then
@@ -556,7 +561,32 @@ then
         AC_DEFINE(HAVE_ASM_AVX,1,[Compiling in AVX])
         have_asm_avx=true
     fi
-
+
+    dnl check for AVX2
+    AC_CACHE_CHECK(for AVX2 on processor(s), ac_cv_flag_avx2, [
+        if grep "^flags.* avx2" /proc/cpuinfo > /dev/null; then
+            ac_cv_flag_avx2=yes
+        else
+            ac_cv_flag_avx2=no
+        fi
+    ])
+    if test $ac_cv_flag_avx2 = yes; then
+        AC_DEFINE(HAVE_ASM_AVX2,1,[Compiling in AVX2])
+        have_asm_avx2=true
+    fi
+
+    dnl check for AVX512
+    AC_CACHE_CHECK(for AVX512 on processor(s), ac_cv_flag_avx512, [
+        if grep "^flags.* avx512" /proc/cpuinfo > /dev/null; then
+            ac_cv_flag_avx512=yes
+        else
+            ac_cv_flag_avx512=no
+        fi
+    ])
+    if test $ac_cv_flag_avx512 = yes; then
+        AC_DEFINE(HAVE_ASM_AVX512,1,[Compiling in AVX512])
+        have_asm_avx512=true
+    fi
 fi
 fi
@@ -702,6 +732,9 @@ fi
 dnl **********************************************************************
 dnl All the conditional stuff for the Makefiles
 AM_CONDITIONAL(HAVE_ASM_MMX, test x$have_asm_mmx = xtrue)
+AM_CONDITIONAL(HAVE_ASM_AVX, test x$have_asm_avx = xtrue )
+AM_CONDITIONAL(HAVE_ASM_AVX2,test x$have_asm_avx2 = xtrue )
+AM_CONDITIONAL(HAVE_ASM_AVX512,test x$have_asm_avx512 = xtrue )
 AM_CONDITIONAL(HAVE_X86CPU, test x$have_x86cpu = xtrue)
 AM_CONDITIONAL(HAVE_PPCCPU, test x$have_ppccpu = xtrue)
 AM_CONDITIONAL(ARCH_PPC, test x$have_ppccpu = xtrue)
@@ -841,6 +874,8 @@ AC_MSG_NOTICE([ SSE2 enabled : ${ac_cv_flag_sse2}])
 AC_MSG_NOTICE([ 3DNOW enabled : ${ac_cv_flag_3dnow}])
 AC_MSG_NOTICE([ CMOV enabled : ${ac_cv_flag_cmov}])
 AC_MSG_NOTICE([ AVX enabled : ${ac_cv_flag_avx}])
+AC_MSG_NOTICE([ AVX2 enabled : ${ac_cv_flag_avx2}])
+AC_MSG_NOTICE([ AVX-512 enabled : ${ac_cv_flag_avx512}])
 fi
 
 if test "$have_ppccpu" = "true" ; then
diff --git a/veejay-current/veejay-core/libvjmem/memcpy.c b/veejay-current/veejay-core/libvjmem/memcpy.c
index 363ff834..1bebcf70 100644
--- a/veejay-current/veejay-core/libvjmem/memcpy.c
+++ b/veejay-current/veejay-core/libvjmem/memcpy.c
@@ -172,7 +172,7 @@
 #define CONFUSION_FACTOR 0
 //Feel free to fine-tune the above 2, it might be possible to get some speedup with them :)
 
-#define FIND_BEST_MAX_ITERATIONS 100
+#define FIND_BEST_MAX_ITERATIONS 1024
 
 #define is_aligned__(PTR,LEN) \
@@ -340,6 +340,8 @@ static __inline__ void * __memcpy(void * to, const void * from, size_t n)
 //#endif
 
 
+static int BENCHMARK_WID = 1920;
+static int BENCHMARK_HEI = 1080;
 
 char *veejay_strncpy( char *dest, const char *src, size_t n )
 {
@@ -855,7 +857,7 @@ static void *sse_memcpy2(void * to, const void * from, size_t len)
     register uintptr_t delta;
 
     /* Align destination to SSE_MMREG_SIZE -boundary */
-    delta = ((uintptr_t)to)&(SSE_MMREG_SIZE-1);
+    delta = ((uintptr_t)to)&(SSE_MMREG_SIZE-1);
     if(delta)
     {
         delta=SSE_MMREG_SIZE-delta;
@@ -1003,28 +1005,249 @@ static void * sse_memcpy(void * to, const void * from, size_t len)
 }
 #endif
 
+#ifdef HAVE_ASM_AVX2
+void *avx2_memcpy(void *to, const void *from, size_t len) {
+    void *retval = to;
+    if (len >= 128) {
+        uintptr_t delta = ((uintptr_t)to) & 31;
+        if (delta) {
+            delta = 32 - delta;
+            len -= delta;
+            __builtin_memcpy(to, from, delta);
+            /* advance past the head copy, otherwise the streaming stores
+               below would overwrite it and hit an unaligned destination */
+            to = (void *)((char *)to + delta);
+            from = (const void *)((const char *)from + delta);
+        }
+        size_t blocks = len / 128;
+        len %= 128;
+        for (size_t i = 0; i < blocks; i++) {
+            _mm_prefetch((const char *)from + 320, _MM_HINT_NTA);
+            _mm_prefetch((const char *)from + 352, _MM_HINT_NTA);
+            __m256i ymm0 = _mm256_loadu_si256((__m256i *)from);
+            __m256i ymm1 = _mm256_loadu_si256((__m256i *)(from + 32));
+            __m256i ymm2 = _mm256_loadu_si256((__m256i *)(from + 64));
+            __m256i ymm3 = _mm256_loadu_si256((__m256i *)(from + 96));
+            _mm256_stream_si256((__m256i *)to, ymm0);
+            _mm256_stream_si256((__m256i *)(to + 32), ymm1);
+            _mm256_stream_si256((__m256i *)(to + 64), ymm2);
+            _mm256_stream_si256((__m256i *)(to + 96), ymm3);
+            from = (const void *)((const char *)from + 128);
+            to = (void *)((char *)to + 128);
+        }
+        _mm_sfence();
+    }
+
+    if (len) {
+        __builtin_memcpy(to, from, len);
+    }
+
+    return retval;
+}
+#endif
+
+#ifdef HAVE_ASM_AVX512
+#define AVX512_MMREG_SIZE 64 // 512-bit register = 64 bytes
+/* note: only the destination alignment uses the full 64-byte register
+   size; the copy loops below stream with 256-bit ymm moves */
+static void * avx512_memcpy(void *to, const void *from, size_t len) {
+    void *retval = to;
+    size_t i;
+    __asm__ __volatile__ (
+        " prefetchnta (%0)\n"
+        " prefetchnta 64(%0)\n"
+        " prefetchnta 128(%0)\n"
+        " prefetchnta 192(%0)\n"
+        " prefetchnta 256(%0)\n"
+        " prefetchnta 320(%0)\n"
+        " prefetchnta 384(%0)\n"
+        " prefetchnta 448(%0)\n"
+        " prefetchnta 512(%0)\n"
+        " prefetchnta 576(%0)\n"
+        " prefetchnta 640(%0)\n"
+        :: "r" (from));
+
+    if (len >= 512) {
+        register uintptr_t delta;
+        delta = ((uintptr_t)to) & (AVX512_MMREG_SIZE - 1);
+        if (delta) {
+            delta = AVX512_MMREG_SIZE - delta;
+            len -= delta;
+            memcpy(to, from, delta);
+            from = (const char *)from + delta;
+            to = (char *)to + delta;
+        }
+
+        i = len >> 8; // len / 256
+        len &= 255;
+        if (((uintptr_t)from) & 63) {
+            for (; i > 0; i--) {
+                __asm__ __volatile__ (
+                    "vmovups (%0), %%ymm0\n"
+                    "vmovups 32(%0), %%ymm1\n"
+                    "vmovups 64(%0), %%ymm2\n"
+                    "vmovups 96(%0), %%ymm3\n"
+                    "vmovups 128(%0), %%ymm4\n"
+                    "vmovups 160(%0), %%ymm5\n"
+                    "vmovups 192(%0), %%ymm6\n"
+                    "vmovups 224(%0), %%ymm7\n"
+                    "vmovntps %%ymm0, (%1)\n"
+                    "vmovntps %%ymm1, 32(%1)\n"
+                    "vmovntps %%ymm2, 64(%1)\n"
+                    "vmovntps %%ymm3, 96(%1)\n"
+                    "vmovntps %%ymm4, 128(%1)\n"
+                    "vmovntps %%ymm5, 160(%1)\n"
+                    "vmovntps %%ymm6, 192(%1)\n"
+                    "vmovntps %%ymm7, 224(%1)\n"
+                    :: "r" (from), "r" (to) : "memory");
+                from = ((const unsigned char *)from) + 256;
+                to = ((unsigned char *)to) + 256;
+            }
+        } else {
+            for (; i > 0; i--) {
+                __asm__ __volatile__ (
+                    "vmovaps (%0), %%ymm0\n"
+                    "vmovaps 32(%0), %%ymm1\n"
+                    "vmovaps 64(%0), %%ymm2\n"
+                    "vmovaps 96(%0), %%ymm3\n"
+                    "vmovaps 128(%0), %%ymm4\n"
+                    "vmovaps 160(%0), %%ymm5\n"
+                    "vmovaps 192(%0), %%ymm6\n"
+                    "vmovaps 224(%0), %%ymm7\n"
+                    "vmovntps %%ymm0, (%1)\n"
+                    "vmovntps %%ymm1, 32(%1)\n"
+                    "vmovntps %%ymm2, 64(%1)\n"
+                    "vmovntps %%ymm3, 96(%1)\n"
+                    "vmovntps %%ymm4, 128(%1)\n"
+                    "vmovntps %%ymm5, 160(%1)\n"
+                    "vmovntps %%ymm6, 192(%1)\n"
+                    "vmovntps %%ymm7, 224(%1)\n"
+                    :: "r" (from), "r" (to) : "memory");
+                from = ((const unsigned char *)from) + 256;
+                to = ((unsigned char *)to) + 256;
+            }
+        }
+        __asm__ __volatile__ ("sfence" ::: "memory");
+    }
+
+    if (len) memcpy(to, from, len);
+
+    return retval;
+}
+#endif
+
 #ifdef HAVE_ASM_AVX
-static void * avx_memcpy(void * to, const void * from, size_t len)
+
+static void* avx_memcpy(void *destination, const void *source, size_t size)
+{
+    unsigned char *dst = (unsigned char*)destination;
+    const unsigned char *src = (const unsigned char*)source;
+    static size_t cachesize = 0x200000; // L3-cache size
+    size_t padding;
+
+    if (size <= 256) {
+        __memcpy(dst, src, size);
+        _mm256_zeroupper();
+        return destination;
+    }
+
+    // align destination to a 32-byte boundary
+    padding = (32 - (((size_t)dst) & 31)) & 31;
+    __m256i head = _mm256_loadu_si256((const __m256i*)src);
+    _mm256_storeu_si256((__m256i*)dst, head);
+    dst += padding;
+    src += padding;
+    size -= padding;
+
+    // medium size copy
+    if (size <= cachesize) {
+        __m256i c0, c1, c2, c3, c4, c5, c6, c7;
+
+        for (; size >= 256; size -= 256) {
+            c0 = _mm256_loadu_si256(((const __m256i*)src) + 0);
+            c1 = _mm256_loadu_si256(((const __m256i*)src) + 1);
+            c2 = _mm256_loadu_si256(((const __m256i*)src) + 2);
+            c3 = _mm256_loadu_si256(((const __m256i*)src) + 3);
+            c4 = _mm256_loadu_si256(((const __m256i*)src) + 4);
+            c5 = _mm256_loadu_si256(((const __m256i*)src) + 5);
+            c6 = _mm256_loadu_si256(((const __m256i*)src) + 6);
+            c7 = _mm256_loadu_si256(((const __m256i*)src) + 7);
+            _mm_prefetch((const char*)(src + 512), _MM_HINT_NTA);
+            src += 256;
+            _mm256_storeu_si256((((__m256i*)dst) + 0), c0);
+            _mm256_storeu_si256((((__m256i*)dst) + 1), c1);
+            _mm256_storeu_si256((((__m256i*)dst) + 2), c2);
+            _mm256_storeu_si256((((__m256i*)dst) + 3), c3);
+            _mm256_storeu_si256((((__m256i*)dst) + 4), c4);
+            _mm256_storeu_si256((((__m256i*)dst) + 5), c5);
+            _mm256_storeu_si256((((__m256i*)dst) + 6), c6);
+            _mm256_storeu_si256((((__m256i*)dst) + 7), c7);
+            dst += 256;
+        }
+    }
+    else { // big memory copy
+        __m256i c0, c1, c2, c3, c4, c5, c6, c7;
+
+        _mm_prefetch((const char*)(src), _MM_HINT_NTA);
+
+        if ((((size_t)src) & 31) == 0) { // source aligned
+            for (; size >= 256; size -= 256) {
+                c0 = _mm256_load_si256(((const __m256i*)src) + 0);
+                c1 = _mm256_load_si256(((const __m256i*)src) + 1);
+                c2 = _mm256_load_si256(((const __m256i*)src) + 2);
+                c3 = _mm256_load_si256(((const __m256i*)src) + 3);
+                c4 = _mm256_load_si256(((const __m256i*)src) + 4);
+                c5 = _mm256_load_si256(((const __m256i*)src) + 5);
+                c6 = _mm256_load_si256(((const __m256i*)src) + 6);
+                c7 = _mm256_load_si256(((const __m256i*)src) + 7);
+                _mm_prefetch((const char*)(src + 512), _MM_HINT_NTA);
+                src += 256;
+                _mm256_stream_si256((((__m256i*)dst) + 0), c0);
+                _mm256_stream_si256((((__m256i*)dst) + 1), c1);
+                _mm256_stream_si256((((__m256i*)dst) + 2), c2);
+                _mm256_stream_si256((((__m256i*)dst) + 3), c3);
+                _mm256_stream_si256((((__m256i*)dst) + 4), c4);
+                _mm256_stream_si256((((__m256i*)dst) + 5), c5);
+                _mm256_stream_si256((((__m256i*)dst) + 6), c6);
+                _mm256_stream_si256((((__m256i*)dst) + 7), c7);
+                dst += 256;
+            }
+        }
+        else { // source unaligned
+            for (; size >= 256; size -= 256) {
+                c0 = _mm256_loadu_si256(((const __m256i*)src) + 0);
+                c1 = _mm256_loadu_si256(((const __m256i*)src) + 1);
+                c2 = _mm256_loadu_si256(((const __m256i*)src) + 2);
+                c3 = _mm256_loadu_si256(((const __m256i*)src) + 3);
+                c4 = _mm256_loadu_si256(((const __m256i*)src) + 4);
+                c5 = _mm256_loadu_si256(((const __m256i*)src) + 5);
+                c6 = _mm256_loadu_si256(((const __m256i*)src) + 6);
+                c7 = _mm256_loadu_si256(((const __m256i*)src) + 7);
+                _mm_prefetch((const char*)(src + 512), _MM_HINT_NTA);
+                src += 256;
+                _mm256_stream_si256((((__m256i*)dst) + 0), c0);
+                _mm256_stream_si256((((__m256i*)dst) + 1), c1);
+                _mm256_stream_si256((((__m256i*)dst) + 2), c2);
+                _mm256_stream_si256((((__m256i*)dst) + 3), c3);
+                _mm256_stream_si256((((__m256i*)dst) + 4), c4);
+                _mm256_stream_si256((((__m256i*)dst) + 5), c5);
+                _mm256_stream_si256((((__m256i*)dst) + 6), c6);
+                _mm256_stream_si256((((__m256i*)dst) + 7), c7);
+                dst += 256;
+            }
+        }
+        _mm_sfence();
+    }
+
+    __memcpy(dst, src, size);
+    _mm256_zeroupper();
+
+    return destination;
+}
+
+
+static void * avx_memcpy2(void * to, const void * from, size_t len)
 {
     void *retval;
     size_t i;
 
     retval = to;
-    /* PREFETCH has effect even for MOVSB instruction ;) */
-    __asm__ __volatile__ (
-        " prefetchnta (%0)\n"
-        " prefetchnta 32(%0)\n"
-        " prefetchnta 64(%0)\n"
-        " prefetchnta 96(%0)\n"
-        " prefetchnta 128(%0)\n"
-        " prefetchnta 160(%0)\n"
-        " prefetchnta 192(%0)\n"
-        " prefetchnta 224(%0)\n"
-        " prefetchnta 256(%0)\n"
-        " prefetchnta 288(%0)\n"
-        : : "r" (from) );
-
-    if(len >= MIN_LEN)
+    if(len >= 256)
     {
         register uintptr_t delta;
         /* Align destinition to MMREG_SIZE -boundary */
@@ -1035,53 +1258,64 @@ static void * avx_memcpy(void * to, const void * from, size_t len)
         len -= delta;
         small_memcpy(to, from, delta);
     }
-    i = len >> 7; /* len/128 */
-    len&=127;
+    i = len >> 8;
+    len&=255;
+
+    __asm__ __volatile__ (
+        "prefetchnta 64(%0)\n"
+        "prefetchnta 128(%0)\n"
+        "prefetchnta 192(%0)\n"
+        "prefetchnta 256(%0)\n"
+        : : "r" (from)
+    );
+
     if(((uintptr_t)from) & 31)
-    /* if SRC is misaligned */
     for(; i>0; i--)
     {
         __asm__ __volatile__ (
-        "prefetchnta 320(%0)\n"
-        "prefetchnta 352(%0)\n"
-        "prefetchnta 384(%0)\n"
-        "prefetchnta 416(%0)\n"
        "vmovups (%0), %%ymm0\n"
        "vmovups 32(%0), %%ymm1\n"
        "vmovups 64(%0), %%ymm2\n"
        "vmovups 96(%0), %%ymm3\n"
+       "vmovups 128(%0), %%ymm4\n"
+       "vmovups 160(%0), %%ymm5\n"
+       "vmovups 192(%0), %%ymm6\n"
+       "vmovups 224(%0), %%ymm7\n"
        "vmovntps %%ymm0, (%1)\n"
        "vmovntps %%ymm1, 32(%1)\n"
        "vmovntps %%ymm2, 64(%1)\n"
        "vmovntps %%ymm3, 96(%1)\n"
+       "vmovntps %%ymm4, 128(%1)\n"
+       "vmovntps %%ymm5, 160(%1)\n"
+       "vmovntps %%ymm6, 192(%1)\n"
+       "vmovntps %%ymm7, 224(%1)\n"
        :: "r" (from), "r" (to) : "memory");
-        from = ((const unsigned char *)from) + 128;
-        to = ((unsigned char *)to) + 128;
+        from = ((const unsigned char *)from) + 256;
+        to = ((unsigned char *)to) + 256;
     }
     else
-    /*
-       Only if SRC is aligned on 16-byte boundary.
-       It allows to use movaps instead of movups, which required data
-       to be aligned or a general-protection exception (#GP) is generated.
-    */
     for(; i>0; i--)
     {
         __asm__ __volatile__ (
-        "prefetchnta 320(%0)\n"
-        "prefetchnta 352(%0)\n"
-        "prefetchnta 384(%0)\n"
-        "prefetchnta 416(%0)\n"
        "vmovaps (%0), %%ymm0\n"
        "vmovaps 32(%0), %%ymm1\n"
        "vmovaps 64(%0), %%ymm2\n"
        "vmovaps 96(%0), %%ymm3\n"
+       "vmovaps 128(%0), %%ymm4\n"
+       "vmovaps 160(%0), %%ymm5\n"
+       "vmovaps 192(%0), %%ymm6\n"
+       "vmovaps 224(%0), %%ymm7\n"
        "vmovntps %%ymm0, (%1)\n"
        "vmovntps %%ymm1, 32(%1)\n"
        "vmovntps %%ymm2, 64(%1)\n"
        "vmovntps %%ymm3, 96(%1)\n"
+       "vmovntps %%ymm4, 128(%1)\n"
+       "vmovntps %%ymm5, 160(%1)\n"
+       "vmovntps %%ymm6, 192(%1)\n"
+       "vmovntps %%ymm7, 224(%1)\n"
        :: "r" (from), "r" (to) : "memory");
-        from = ((const unsigned char *)from) + 128;
-        to = ((unsigned char *)to) + 128;
+        from = ((const unsigned char *)from) + 256;
+        to = ((unsigned char *)to) + 256;
     }
     /* since movntq is weakly-ordered, a "sfence"
      * is needed to become ordered again. */
@@ -1230,11 +1464,11 @@ static void *fast_memcpy(void * to, const void * from, size_t len)
 #ifndef HAVE_ONLY_MMX1
     /* PREFETCH has effect even for MOVSB instruction ;) */
     __asm__ volatile (
-        PREFETCH" (%0)\n"
-        PREFETCH" 64(%0)\n"
-        PREFETCH" 128(%0)\n"
-        PREFETCH" 192(%0)\n"
-        PREFETCH" 256(%0)\n"
+        PREFETCH" (%0)\n"
+        PREFETCH" 64(%0)\n"
+        PREFETCH" 128(%0)\n"
+        PREFETCH" 192(%0)\n"
+        PREFETCH" 256(%0)\n"
        : : "r" (from) );
 #endif
@@ -1249,7 +1483,7 @@
        len -= delta;
        small_memcpy(to, from, delta);
    }
-    i = len >> 6; /* len/64 */
+    i = len >> 6; /* len/64 */
    len&=63;
 
    /* This algorithm is top effective when the code consequently
@@ -1337,10 +1571,10 @@
        "xor %%"REG_a", %%"REG_a"	\n\t"
        ASMALIGN(4)
        "1:			\n\t"
-        "movl (%0, %%"REG_a"), %%ecx	\n\t"
-        "movl 32(%0, %%"REG_a"), %%ecx	\n\t"
-        "movl 64(%0, %%"REG_a"), %%ecx	\n\t"
-        "movl 96(%0, %%"REG_a"), %%ecx	\n\t"
+        "movl (%0, %%"REG_a"), %%ecx	\n\t"
+        "movl 32(%0, %%"REG_a"), %%ecx	\n\t"
+        "movl 64(%0, %%"REG_a"), %%ecx	\n\t"
+        "movl 96(%0, %%"REG_a"), %%ecx	\n\t"
        "add $128, %%"REG_a"	\n\t"
        "cmp %3, %%"REG_a"	\n\t"
        " jb 1b			\n\t"
@@ -1393,7 +1627,7 @@ static void *fast_memcpy(void * to, const void * from, size_t len)
    {
        __asm__ volatile (
 #ifndef HAVE_ONLY_MMX1
-        PREFETCH" 320(%0)\n"
+        PREFETCH" 320(%0)\n"
 #endif
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
@@ -1614,9 +1848,9 @@ static inline void memcpy_neon_256( uint8_t *dst, const uint8_t *src )
    : [src] "+r" (src), [dst] "+r" (dst)
    :: "memory"
    , "d0", "d1", "d2", "d3", "d4", "d5", "d6" , "d7",
-    "d8", "d9", "d10","d11","d12","d13","d14", "d15",
-    "d16","d17","d18","d19","d20","d21","d22", "d23",
-    "d24","d23","d24","d25","d26","d27","d28", "d29",
+    "d8", "d9", "d10","d11","d12","d13","d14", "d15",
+    "d16","d17","d18","d19","d20","d21","d22", "d23",
+    "d24","d25","d26","d27","d28", "d29",
    "d30","d31"
    );
@@ -1911,7 +2145,7 @@ void memset_asimd_v2(void *dst, uint8_t val, size_t len) {
 }
 void memset_asimd_v4(void *dst, uint8_t val, size_t len) {
    if( len == 0 || NULL == dst )
-        return;
+        return;
 
    uint8x16_t v = vdupq_n_u8(val);
    size_t multiple_of_16 = len & ~0xF;
@@ -1931,7 +2165,7 @@ void memset_asimd_64(uint8_t *dst, uint8_t value, size_t size) {
    if( size == 0 || NULL == dst )
        return;
 
-    uint8x16_t value_v = vdupq_n_u8(value);
+    uint8x16_t value_v = vdupq_n_u8(value);
 
    size_t num_blocks = size / 64;
    size_t remaining_bytes = size % 64;
@@ -1972,7 +2206,7 @@ void memset_asimd_32(uint8_t *dst, uint8_t value, size_t size) {
    if( size == 0 || dst == NULL )
        return;
 
-    uint8x16_t value_v = vdupq_n_u8(value);
+    uint8x16_t value_v = vdupq_n_u8(value);
 
    size_t num_blocks = size / 32;
    size_t remaining_bytes = size % 32;
@@ -2050,6 +2284,13 @@ static struct {
 #endif
 #ifdef HAVE_ASM_AVX
    { "AVX optimized memcpy()", (void*) avx_memcpy, 0,AV_CPU_FLAG_AVX },
+    { "AVX simple memcpy()", (void*) avx_memcpy2, 0, AV_CPU_FLAG_AVX },
+#endif
+#ifdef HAVE_ASM_AVX2
+    { "AVX2 optimized memcpy()", (void*) avx2_memcpy, 0, AV_CPU_FLAG_AVX2 },
+#endif
+#ifdef HAVE_ASM_AVX512
+    { "AVX-512 optimized memcpy()", (void*) avx512_memcpy, 0, AV_CPU_FLAG_AVX512 },
 #endif
 #ifdef HAVE_ASM_MMX
    { "MMX optimized memcpy()", (void*) mmx_memcpy, 0,AV_CPU_FLAG_MMX },
@@ -2131,14 +2372,14 @@
    { "memset align 32 (C) Harm Hanemaaijer ", (void*) memset_new_align_32,0,0 },
 #endif
 #ifdef HAVE_ASM_SSE4_1
-    { "SSE4_1 memset()", (void*) sse41_memset,0, AV_CPU_FLAG_SSE4},
+    { "SSE4_1 memset()", (void*) sse41_memset,0, AV_CPU_FLAG_SSE4},
    { "SSE4_1 memset() v2", (void*) sse41_memset_v2,0, AV_CPU_FLAG_SSE4},
 #endif
 #ifdef HAVE_ASM_SSE4_2
    { "SSE4_2 unaligned memset()", (void*) sse42_memset,0, AV_CPU_FLAG_SSE42},
    { "SSE4_2 aligned memset()", (void*) sse42_aligned_memset, 0, AV_CPU_FLAG_SSE42 },
 #endif
-    { "64-bit word memset()", (void*) memset_64, 0, 0},
+    { "64-bit word memset()", (void*) memset_64, 0, 0},
 
    { NULL, NULL, 0, 0},
 };
@@ -2176,11 +2417,7 @@ static int set_user_selected_memcpy()
            return i;
        }
    }
-    veejay_msg(VEEJAY_MSG_ERROR, "No valid memcpy method selected, please use one of the following:");
-    for( i = 1; memcpy_method[i].name; i ++ ) {
-        veejay_msg(VEEJAY_MSG_ERROR, "\t\"%s\"", memcpy_method[i].name);
-    }
-    veejay_msg(VEEJAY_MSG_ERROR, "Using memcpy method '%s'", memcpy_method[1].name );
+    veejay_msg(VEEJAY_MSG_INFO, "Using memcpy method '%s'", memcpy_method[1].name );
    }
    return 0;
 }
@@ -2196,10 +2433,6 @@ static int set_user_selected_memset()
            return i;
        }
    }
-    veejay_msg(VEEJAY_MSG_ERROR, "No valid memset method selected, please use one of the following:");
-    for( i = 1; memset_method[i].name; i ++ ) {
-        veejay_msg(VEEJAY_MSG_ERROR, "\t\"%s\"", memset_method[i].name);
-    }
    veejay_msg(VEEJAY_MSG_ERROR, "Using memset method '%s'", memset_method[1].name );
    }
    return 0;
@@ -2208,7 +2441,7 @@ static int set_user_selected_memset()
 static void mem_fill_block(uint8_t *dst, size_t len) {
    int i;
    for( i = 0; i < len ; i ++ ) {
-        dst[i] = i % 255;
+        dst[i] = i % 256;
    }
 }
@@ -2235,17 +2468,17 @@
    double t;
    uint8_t *buf1, *buf2, *validbuf;
    int i, k;
-    int bufsize = 10 * 1048576;
+    int bufsize = (BENCHMARK_WID * BENCHMARK_HEI * 3);
 
-    if (!(buf1 = (uint8_t*) malloc( bufsize * sizeof(uint8_t))))
+    if (!(buf1 = (uint8_t*) vj_malloc( bufsize * sizeof(uint8_t))))
        return;
 
-    if (!(buf2 = (uint8_t*) malloc( bufsize * sizeof(uint8_t)))) {
+    if (!(buf2 = (uint8_t*) vj_malloc( bufsize * sizeof(uint8_t)))) {
        free( buf1 );
        return;
    }
 
-    if (!(validbuf = (uint8_t*) malloc( bufsize * sizeof(uint8_t)))) {
+    if (!(validbuf = (uint8_t*) vj_malloc( bufsize * sizeof(uint8_t)))) {
        free( buf1 );
        free( buf2 );
        return;
@@ -2267,7 +2500,7 @@
    memset(buf2, 0, bufsize);
    mem_fill_block(validbuf, bufsize);
 
-    for( i = 1; memcpy_method[i].name; i ++ ) {
+    for( i = 1; memcpy_method[i].name != NULL; i ++ ) {
        if( memcpy_method[i].cpu_require &&
            !(cpu_flags & memcpy_method[i].cpu_require ) ) {
            memcpy_method[i].t = 0.0;
@@ -2284,13 +2517,18 @@
            t = 0;
        }
 
+        if( t > 0 )
+            veejay_msg(VEEJAY_MSG_INFO, "method '%s' completed in %g seconds", memcpy_method[i].name, t );
+        else
+            veejay_msg(VEEJAY_MSG_WARNING, "method '%s' fails validation", memcpy_method[i].name );
+
        memcpy_method[i].t = t;
    }
 
-    for( i = 1; memcpy_method[i].name; i ++ ) {
+    for( i = 1; memcpy_method[i].name != NULL; i ++ ) {
        if(best == 0 ) {
            best = i;
-            t = memcpy_method[i].t;
+            t = memcpy_method[i].t;
            continue;
        }
@@ -2314,6 +2552,7 @@ set_best_memcpy_method:
    selected_best_memcpy = best;
 
    veejay_msg(VEEJAY_MSG_INFO, "Selected %s", memcpy_method[best].name);
+    veejay_msg(VEEJAY_MSG_WARNING, "export VEEJAY_MEMCPY_METHOD=\"%s\"", memcpy_method[best].name );
 }
 
 void find_best_memset()
@@ -2326,13 +2565,13 @@
    double t;
    char *buf1, *buf2;
    int i, k;
-    int bufsize = 10 * 1048576;
+    int bufsize = (BENCHMARK_WID * BENCHMARK_HEI * 3);
 
    int cpu_flags = av_get_cpu_flags();
 
-    if (!(buf1 = (char*) malloc( bufsize * sizeof(char) )))
-        return;
+    if (!(buf1 = (char*) vj_malloc( bufsize * sizeof(char) )))
+        return;
 
-    if (!(buf2 = (char*) malloc( bufsize * sizeof(char) ))) {
+    if (!(buf2 = (char*) vj_malloc( bufsize * sizeof(char) ))) {
        free( buf1 );
        return;
    }
@@ -2342,7 +2581,7 @@
    memset( buf1, 0, bufsize * sizeof(char));
    memset( buf2, 0, bufsize * sizeof(char));
 
-    for (i=1; memset_method[i].name; i++)
+    for (i=1; memset_method[i].name != NULL; i++)
    {
        if( memset_method[i].cpu_require &&
            !(cpu_flags & memset_method[i].cpu_require ) ) {
            memset_method[i].t= 0;
@@ -2354,7 +2593,9 @@
            memset_method[i].function( buf1 , 0 , bufsize );
        }
        t = get_time() - t;
-
+
+        veejay_msg(VEEJAY_MSG_INFO, "method '%s' completed in %g seconds", memset_method[i].name, t );
+
        memset_method[i].t = t;
 
        if (best == 0 || t < memset_method[best].t)
@@ -2374,19 +2615,25 @@ set_best_memset_method:
    selected_best_memset = best;
 
    veejay_msg(VEEJAY_MSG_INFO, "Selected %s", memset_method[best].name);
+    veejay_msg(VEEJAY_MSG_WARNING, "export VEEJAY_MEMSET_METHOD=\"%s\"", memset_method[best].name );
+
 }
 
-void vj_mem_set_defaults() {
-#ifdef STRICT_CHECKING
-    veejay_memset = memset;
-    veejay_memcpy = memcpy;
-    veejay_msg(VEEJAY_MSG_WARNING, "Using default memcpy() / memset() functions");
-#else
+void vj_mem_set_defaults(int w, int h) {
+
+    if( w > 0 )
+        BENCHMARK_WID = w;
+    if( h > 0 )
+        BENCHMARK_HEI = h;
+
    veejay_memset = memset_method[1].function;
    veejay_memcpy = memcpy_method[1].function;
-#endif
+
+    set_user_selected_memcpy();
+    set_user_selected_memset();
 }
+
 
 static void vj_frame_copy_job( void *arg ) {
    int i;
    vj_task_arg_t *info = (vj_task_arg_t*) arg;
@@ -2458,8 +2705,8 @@ void vj_frame_slow_threaded( uint8_t **p0_buffer, uint8_t **p1_buffer, uint8_t *
 
 #ifdef HAVE_ASM_MMX
    __asm __volatile(_EMMS"	\n\t"
-            SFENCE"	\n\t"
-            :::"memory");
+            SFENCE"	\n\t"
+            :::"memory");
 #endif
 
    /*
@@ -2635,30 +2882,34 @@ static void run_benchmark_test(int n_tasks, benchmark_func f, const char *str, i
    double stats[N];
    uint32_t i;
    double fastest = 0.0;
+    double slowest = 0.0;
 
    float work_size = (planes[0] + planes[1] + planes[2] + planes[3]) / 1048576.0f;
 
-    veejay_msg(VEEJAY_MSG_INFO, "run test '%s' (%dx) on chunks of %2.2f MB:", str, N, work_size );
+    veejay_msg(VEEJAY_MSG_INFO, "run %dx test '%s' on chunks of %2.2f MB:", N, str, work_size );
 
    for( i = 0; i < N; i ++ ) {
        stats[i] = f( n_frames, n_tasks, source, dest, planes );
-        if( stats[i] > fastest )
+        if(i == 0 || stats[i] < fastest )
            fastest = stats[i];
+
+        if( stats[i] > slowest )
+            slowest = stats[i];
    }
 
    double sum = 0.0;
-    double slowest=fastest;
    for( i = 0; i < N; i ++ ) {
-        if( stats[i] < fastest ) {
-            fastest = stats[i];
-        }
        sum += stats[i];
    }
 
    double average = (sum / N);
 
-    veejay_msg(VEEJAY_MSG_INFO, "run done: best score for %s is %g, worst is %g, average is %g",str, fastest, slowest, average );
+    /* get_time() reports seconds, so scale by 1000 for milliseconds */
+    double fastest_ms = fastest * 1000.0;
+    double slowest_ms = slowest * 1000.0;
+    double average_ms = average * 1000.0;
+
+    veejay_msg(VEEJAY_MSG_INFO, "run done: best score for %s is %gms, worst is %gms, average is %gms",str, fastest_ms, slowest_ms, average_ms );
 }
 
 static void benchmark_tasks(unsigned int n_tasks, long n_frames, int w, int h)
@@ -2708,6 +2959,10 @@ void benchmark_veejay(int w, int h)
 
    veejay_msg(VEEJAY_MSG_INFO, "Starting benchmark %dx%d YUVP 4:2:2 (100 frames)", w,h);
 
+    find_best_memcpy();
+
+    find_best_memset();
+
    init_parallel_tasks( 0 );
 
    benchmark_tasks( 0,100,w,h );
diff --git a/veejay-current/veejay-core/libvjmem/vj-x86.c b/veejay-current/veejay-core/libvjmem/vj-x86.c
index a08a0bb3..0ec55ede 100644
--- a/veejay-current/veejay-core/libvjmem/vj-x86.c
+++ b/veejay-current/veejay-core/libvjmem/vj-x86.c
@@ -146,7 +146,7 @@ int mem_align_size()
    return MEM_ALIGNMENT_SIZE;
 }
 
-void vj_mem_init(void)
+void vj_mem_init(int w, int h)
 {
 #if defined(ARCH_X86) || defined(ARCH_X86_X64) || defined(HAVE_ARM)
    CACHE_LINE_SIZE = get_cache_line_size();
@@ -158,7 +158,7 @@
 #endif
    //find_best_memcpy();
    //find_best_memset();
-    vj_mem_set_defaults();
+    vj_mem_set_defaults(w,h);
 }
 
 void vj_mem_optimize() {
diff --git a/veejay-current/veejay-core/libvjmem/vjmem.h b/veejay-current/veejay-core/libvjmem/vjmem.h
index 362bdfcb..ad9301a2 100644
--- a/veejay-current/veejay-core/libvjmem/vjmem.h
+++ b/veejay-current/veejay-core/libvjmem/vjmem.h
@@ -24,9 +24,9 @@
 extern void *(* veejay_memcpy)(void *to, const void *from, size_t len);
 extern void *(* veejay_memset)(void *to, uint8_t val, size_t len);
-extern void vj_mem_init(void);
+extern void vj_mem_init(int w, int h);
 extern void vj_mem_optimize();
-extern void vj_mem_set_defaults();
+extern void vj_mem_set_defaults(int w, int h);
 extern int vj_mem_threaded_init(int w, int h);
 extern void *vj_malloc_(size_t size);
 extern void *vj_calloc_(size_t size );
diff --git a/veejay-current/veejay-core/veejay.arch b/veejay-current/veejay-core/veejay.arch
index 86d65bc9..ea4c76d0 100644
--- a/veejay-current/veejay-core/veejay.arch
+++ b/veejay-current/veejay-core/veejay.arch
@@ -1 +1 @@
--march=silvermont
+-march=alderlake
diff --git a/veejay-current/veejay-utils/src/sayVIMS.c b/veejay-current/veejay-utils/src/sayVIMS.c
index f6589755..c748e5a6 100644
--- a/veejay-current/veejay-utils/src/sayVIMS.c
+++ b/veejay-current/veejay-utils/src/sayVIMS.c
@@ -511,7 +511,7 @@ int main(int argc, char *argv[])
        return -1;
    }
 
-    vj_mem_init();
+    vj_mem_init(0,0);
 
    reconnect();