Mirror of https://github.com/game-stop/veejay.git (synced 2025-12-11 18:35:00 +01:00)

Benchmark the (fast?) memcpy/memset functions when the --benchmark command-line option is given; allow the user to pick a method by preference via the VEEJAY_MEMSET_METHOD and VEEJAY_MEMCPY_METHOD environment variables.
@@ -311,7 +311,7 @@ int main(int argc, char **argv)
 }
 g_option_context_free(context);

-vj_mem_init();
+vj_mem_init(0,0);
 vevo_strict_init();

 status = g_application_run (G_APPLICATION (app), argc, argv);

@@ -222,6 +222,10 @@ have_asm_mmx=false
 have_asm_sse=false
 have_asm_sse2=false
 have_asm_mmx2=false
+have_asm_avx=false
+have_asm_avx2=false
+have_asm_avx512=false
+have_asm_erms=false
 have_asm_3dnow=false
 have_cmov=false
 have_x86cpu=false

@@ -233,6 +237,7 @@ have_ps2=false
 have_arm=false
 have_armv7a=false

+
 OP_CFLAGS=""
 LZO_EXTRA_CFLAGS="-DMINILZO_HAVE_CONFIG_H"

@@ -396,24 +401,24 @@ then
 ac_cv_flag_mmx=yes
 ac_cv_flag_sse=yes
 ac_cv_flag_sse4_2=yes
 ac_cv_flag_sse4_1=yes
 ac_cv_flag_sse2=yes
 ac_cv_flag_cmov=yes

 AC_DEFINE(HAVE_ASM_MMX,1,[Compiling in MMX support])
 AC_DEFINE(HAVE_MMX,1,[Compiling in MMX support])
 have_asm_mmx=true

 AC_DEFINE(HAVE_ASM_SSE,1,[Compiling in SSE support])
 AC_DEFINE(HAVE_SSE,1,[Compiling in SSE support])
 have_asm_sse=true

 AC_DEFINE(HAVE_ASM_SSE2,1,[Compiling in SSE2 support])
 AC_DEFINE(HAVE_SSE2,1,[Compiling in SSE2 support])
 have_asm_sse2=true

 AC_DEFINE(HAVE_CMOV,1,[Compiling in CMOV])
 have_cmov=true
 fi

 if test x$have_linux = xtrue

@@ -556,7 +561,32 @@ then
 AC_DEFINE(HAVE_ASM_AVX,1,[Compiling in AVX])
 have_asm_avx=true
 fi

+dnl check for AVX2
+AC_CACHE_CHECK(for AVX2 on processor(s), ac_cv_flag_avx2, [
+if grep "^flags.* avx2" /proc/cpuinfo > /dev/null; then
+ac_cv_flag_avx2=yes
+else
+ac_cv_flag_avx2=no
+fi
+])
+if test $ac_cv_flag_avx2 = yes; then
+AC_DEFINE(HAVE_ASM_AVX2,1,[Compiling in AVX2])
+have_asm_avx2=true
+fi
+
+dnl check for AVX512
+AC_CACHE_CHECK(for AVX512 on processor(s), ac_cv_flag_avx512, [
+if grep "^flags.* avx512" /proc/cpuinfo > /dev/null; then
+ac_cv_flag_avx512=yes
+else
+ac_cv_flag_avx512=no
+fi
+])
+if test $ac_cv_flag_avx512 = yes; then
+AC_DEFINE(HAVE_ASM_AVX512,1,[Compiling in AVX512])
+have_asm_avx512=true
+fi
 fi
 fi
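The AVX2/AVX512 checks added above follow the file's existing pattern of grepping /proc/cpuinfo, so they see only the build machine's CPU and are Linux-only; at run time the method table further down additionally gates each routine on av_get_cpu_flags(). For comparison, a minimal standalone runtime check is sketched here; it uses the GCC/Clang x86 builtins and is not part of the commit:

#include <stdio.h>

/* Sketch only: __builtin_cpu_supports() is a GCC/Clang builtin;
 * veejay's own runtime gate is FFmpeg's av_get_cpu_flags(). */
int main(void)
{
    __builtin_cpu_init();
    printf("avx2:    %d\n", __builtin_cpu_supports("avx2"));
    printf("avx512f: %d\n", __builtin_cpu_supports("avx512f"));
    return 0;
}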
@@ -702,6 +732,9 @@ fi
 dnl **********************************************************************
 dnl All the conditional stuff for the Makefiles
 AM_CONDITIONAL(HAVE_ASM_MMX, test x$have_asm_mmx = xtrue)
+AM_CONDITIONAL(HAVE_ASM_AVX, test x$have_asm_avx = xtrue )
+AM_CONDITIONAL(HAVE_ASM_AVX2,test x$have_asm_avx2 = xtrue )
+AM_CONDITIONAL(HAVE_ASM_AVX512,test x$have_asm_avx512 = xtrue )
 AM_CONDITIONAL(HAVE_X86CPU, test x$have_x86cpu = xtrue)
 AM_CONDITIONAL(HAVE_PPCCPU, test x$have_ppccpu = xtrue)
 AM_CONDITIONAL(ARCH_PPC, test x$have_ppccpu = xtrue)

@@ -841,6 +874,8 @@ AC_MSG_NOTICE([ SSE2 enabled : ${ac_cv_flag_sse2}])
 AC_MSG_NOTICE([ 3DNOW enabled : ${ac_cv_flag_3dnow}])
 AC_MSG_NOTICE([ CMOV enabled : ${ac_cv_flag_cmov}])
 AC_MSG_NOTICE([ AVX enabled : ${ac_cv_flag_avx}])
+AC_MSG_NOTICE([ AVX2 enabled : ${ac_cv_flag_avx2}])
+AC_MSG_NOTICE([ AVX-512 enabled : ${ac_cv_flag_avx512}])
 fi

 if test "$have_ppccpu" = "true" ; then

@@ -172,7 +172,7 @@
 #define CONFUSION_FACTOR 0
 //Feel free to fine-tune the above 2, it might be possible to get some speedup with them :)

-#define FIND_BEST_MAX_ITERATIONS 100
+#define FIND_BEST_MAX_ITERATIONS 1024


 #define is_aligned__(PTR,LEN) \

@@ -340,6 +340,8 @@ static __inline__ void * __memcpy(void * to, const void * from, size_t n)
 //#endif


+static int BENCHMARK_WID = 1920;
+static int BENCHMARK_HEI = 1080;

 char *veejay_strncpy( char *dest, const char *src, size_t n )
 {

@@ -855,7 +857,7 @@ static void *sse_memcpy2(void * to, const void * from, size_t len)
 register uintptr_t delta;

 /* Align destination to SSE_MMREG_SIZE -boundary */
 delta = ((uintptr_t)to)&(SSE_MMREG_SIZE-1);
 if(delta)
 {
 delta=SSE_MMREG_SIZE-delta;

@@ -1003,28 +1005,249 @@ static void * sse_memcpy(void * to, const void * from, size_t len)
 }
 #endif

+#ifdef HAVE_ASM_AVX2
+void *avx2_memcpy(void *to, const void *from, size_t len) {
+void *retval = to;
+if (len >= 128) {
+uintptr_t delta = ((uintptr_t)to) & 31;
+if (delta) {
+delta = 32 - delta;
+len -= delta;
+__builtin_memcpy(to, from, delta);
+from = (const void *)((const char *)from + delta); /* advance past the head copy so the streamed destination is 32-byte aligned */
+to = (void *)((char *)to + delta);
+}
+size_t blocks = len / 128;
+len %= 128;
+for (size_t i = 0; i < blocks; i++) {
+_mm_prefetch((const char *)from + 320, _MM_HINT_NTA);
+_mm_prefetch((const char *)from + 352, _MM_HINT_NTA);
+__m256i ymm0 = _mm256_loadu_si256((__m256i *)from);
+__m256i ymm1 = _mm256_loadu_si256((__m256i *)(from + 32));
+__m256i ymm2 = _mm256_loadu_si256((__m256i *)(from + 64));
+__m256i ymm3 = _mm256_loadu_si256((__m256i *)(from + 96));
+_mm256_stream_si256((__m256i *)to, ymm0);
+_mm256_stream_si256((__m256i *)(to + 32), ymm1);
+_mm256_stream_si256((__m256i *)(to + 64), ymm2);
+_mm256_stream_si256((__m256i *)(to + 96), ymm3);
+from = (const void *)((const char *)from + 128);
+to = (void *)((char *)to + 128);
+}
+_mm_sfence();
+}
+
+if (len) {
+__builtin_memcpy(to, from, len);
+}
+
+return retval;
+}
+#endif
+
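The delicate cases for a routine like avx2_memcpy() are the head (the destination must be 32-byte aligned before the non-temporal stores) and the sub-128-byte tail. A small validation harness, my sketch rather than anything in the commit, assuming the function is compiled with -mavx2 and linked in:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

void *avx2_memcpy(void *to, const void *from, size_t len); /* from the patch */

int main(void)
{
    enum { MAX = 4096 };
    unsigned char *src = malloc(MAX + 64), *dst = malloc(MAX + 64), *ref = malloc(MAX + 64);
    for (size_t i = 0; i < MAX + 64; i++)
        src[i] = (unsigned char)(i * 131u);
    /* sweep lengths and destination misalignments */
    for (size_t len = 0; len <= MAX; len += 37) {
        for (size_t off = 0; off < 32; off++) {
            memset(dst, 0xAA, MAX + 64);
            memset(ref, 0xAA, MAX + 64);
            avx2_memcpy(dst + off, src, len);
            memcpy(ref + off, src, len);
            if (memcmp(dst, ref, MAX + 64) != 0) {
                fprintf(stderr, "mismatch: len=%zu off=%zu\n", len, off);
                return 1;
            }
        }
    }
    puts("avx2_memcpy OK");
    return 0;
}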
+#ifdef HAVE_ASM_AVX512
+#define AVX512_MMREG_SIZE 64 // 512-bit register = 64 bytes
+static void * avx512_memcpy(void *to, const void *from, size_t len) {
+void *retval = to;
+size_t i;
+__asm__ __volatile__ (
+" prefetchnta (%0)\n"
+" prefetchnta 64(%0)\n"
+" prefetchnta 128(%0)\n"
+" prefetchnta 192(%0)\n"
+" prefetchnta 256(%0)\n"
+" prefetchnta 320(%0)\n"
+" prefetchnta 384(%0)\n"
+" prefetchnta 448(%0)\n"
+" prefetchnta 512(%0)\n"
+" prefetchnta 576(%0)\n"
+" prefetchnta 640(%0)\n"
+:: "r" (from));
+
+if (len >= 512) {
+register uintptr_t delta;
+delta = ((uintptr_t)to) & (AVX512_MMREG_SIZE - 1);
+if (delta) {
+delta = AVX512_MMREG_SIZE - delta;
+len -= delta;
+memcpy(to, from, delta);
+from = (char *)from + delta;
+to = (char *)to + delta;
+}
+
+i = len >> 8; // len / 256
+len &= 255;
+if (((uintptr_t)from) & 63) {
+for (; i > 0; i--) {
+__asm__ __volatile__ (
+"vmovups (%0), %%ymm0\n"
+"vmovups 32(%0), %%ymm1\n"
+"vmovups 64(%0), %%ymm2\n"
+"vmovups 96(%0), %%ymm3\n"
+"vmovups 128(%0), %%ymm4\n"
+"vmovups 160(%0), %%ymm5\n"
+"vmovups 192(%0), %%ymm6\n"
+"vmovups 224(%0), %%ymm7\n"
+"vmovntps %%ymm0, (%1)\n"
+"vmovntps %%ymm1, 32(%1)\n"
+"vmovntps %%ymm2, 64(%1)\n"
+"vmovntps %%ymm3, 96(%1)\n"
+"vmovntps %%ymm4, 128(%1)\n"
+"vmovntps %%ymm5, 160(%1)\n"
+"vmovntps %%ymm6, 192(%1)\n"
+"vmovntps %%ymm7, 224(%1)\n"
+:: "r" (from), "r" (to) : "memory");
+from = ((const unsigned char *)from) + 256;
+to = ((unsigned char *)to) + 256;
+}
+} else {
+for (; i > 0; i--) {
+__asm__ __volatile__ (
+"vmovaps (%0), %%ymm0\n"
+"vmovaps 32(%0), %%ymm1\n"
+"vmovaps 64(%0), %%ymm2\n"
+"vmovaps 96(%0), %%ymm3\n"
+"vmovaps 128(%0), %%ymm4\n"
+"vmovaps 160(%0), %%ymm5\n"
+"vmovaps 192(%0), %%ymm6\n"
+"vmovaps 224(%0), %%ymm7\n"
+"vmovntps %%ymm0, (%1)\n"
+"vmovntps %%ymm1, 32(%1)\n"
+"vmovntps %%ymm2, 64(%1)\n"
+"vmovntps %%ymm3, 96(%1)\n"
+"vmovntps %%ymm4, 128(%1)\n"
+"vmovntps %%ymm5, 160(%1)\n"
+"vmovntps %%ymm6, 192(%1)\n"
+"vmovntps %%ymm7, 224(%1)\n"
+:: "r" (from), "r" (to) : "memory");
+from = ((const unsigned char *)from) + 256;
+to = ((unsigned char *)to) + 256;
+}
+}
+__asm__ __volatile__ ("sfence" ::: "memory");
+}
+
+if (len) memcpy(to, from, len);
+
+return retval;
+}
+#endif
+
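Worth noting: despite its name and the 64-byte AVX512_MMREG_SIZE alignment step, avx512_memcpy() above still moves data through 256-bit ymm registers (vmovups/vmovaps plus vmovntps); structurally it is the AVX loop with a deeper prefetch window. A variant that actually uses 512-bit zmm operations could look like the sketch below; this is my assumption, not code from the commit, and needs -mavx512f:

#include <immintrin.h>

/* Hypothetical zmm inner loop: copies 256-byte blocks with non-temporal
 * 64-byte stores; the caller must 64-byte-align `to` beforehand. */
static void zmm_copy_blocks(void *to, const void *from, size_t blocks)
{
    char *d = (char *)to;
    const char *s = (const char *)from;
    for (; blocks > 0; blocks--) {
        __m512i z0 = _mm512_loadu_si512((const void *)(s + 0));
        __m512i z1 = _mm512_loadu_si512((const void *)(s + 64));
        __m512i z2 = _mm512_loadu_si512((const void *)(s + 128));
        __m512i z3 = _mm512_loadu_si512((const void *)(s + 192));
        _mm512_stream_si512((void *)(d + 0), z0);
        _mm512_stream_si512((void *)(d + 64), z1);
        _mm512_stream_si512((void *)(d + 128), z2);
        _mm512_stream_si512((void *)(d + 192), z3);
        s += 256;
        d += 256;
    }
    _mm_sfence(); /* order the weakly-ordered streaming stores */
}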
 #ifdef HAVE_ASM_AVX
-static void * avx_memcpy(void * to, const void * from, size_t len)
+static void* avx_memcpy(void *destination, const void *source, size_t size)
+{
+unsigned char *dst = (unsigned char*)destination;
+const unsigned char *src = (const unsigned char*)source;
+static size_t cachesize = 0x200000; // L3-cache size
+size_t padding;
+
+if (size <= 256) {
+__memcpy(dst, src, size);
+_mm256_zeroupper();
+return destination;
+}
+
+// align destination to 32 bytes boundary
+padding = (32 - (((size_t)dst) & 31)) & 31;
+__m256i head = _mm256_loadu_si256((const __m256i*)src);
+_mm256_storeu_si256((__m256i*)dst, head);
+dst += padding;
+src += padding;
+size -= padding;
+
+// medium size copy
+if (size <= cachesize) {
+__m256i c0, c1, c2, c3, c4, c5, c6, c7;
+
+for (; size >= 256; size -= 256) {
+c0 = _mm256_loadu_si256(((const __m256i*)src) + 0);
+c1 = _mm256_loadu_si256(((const __m256i*)src) + 1);
+c2 = _mm256_loadu_si256(((const __m256i*)src) + 2);
+c3 = _mm256_loadu_si256(((const __m256i*)src) + 3);
+c4 = _mm256_loadu_si256(((const __m256i*)src) + 4);
+c5 = _mm256_loadu_si256(((const __m256i*)src) + 5);
+c6 = _mm256_loadu_si256(((const __m256i*)src) + 6);
+c7 = _mm256_loadu_si256(((const __m256i*)src) + 7);
+_mm_prefetch((const char*)(src + 512), _MM_HINT_NTA);
+src += 256;
+_mm256_storeu_si256((((__m256i*)dst) + 0), c0);
+_mm256_storeu_si256((((__m256i*)dst) + 1), c1);
+_mm256_storeu_si256((((__m256i*)dst) + 2), c2);
+_mm256_storeu_si256((((__m256i*)dst) + 3), c3);
+_mm256_storeu_si256((((__m256i*)dst) + 4), c4);
+_mm256_storeu_si256((((__m256i*)dst) + 5), c5);
+_mm256_storeu_si256((((__m256i*)dst) + 6), c6);
+_mm256_storeu_si256((((__m256i*)dst) + 7), c7);
+dst += 256;
+}
+}
+else { // big memory copy
+__m256i c0, c1, c2, c3, c4, c5, c6, c7;
+
+_mm_prefetch((const char*)(src), _MM_HINT_NTA);
+
+if ((((size_t)src) & 31) == 0) { // source aligned
+for (; size >= 256; size -= 256) {
+c0 = _mm256_load_si256(((const __m256i*)src) + 0);
+c1 = _mm256_load_si256(((const __m256i*)src) + 1);
+c2 = _mm256_load_si256(((const __m256i*)src) + 2);
+c3 = _mm256_load_si256(((const __m256i*)src) + 3);
+c4 = _mm256_load_si256(((const __m256i*)src) + 4);
+c5 = _mm256_load_si256(((const __m256i*)src) + 5);
+c6 = _mm256_load_si256(((const __m256i*)src) + 6);
+c7 = _mm256_load_si256(((const __m256i*)src) + 7);
+_mm_prefetch((const char*)(src + 512), _MM_HINT_NTA);
+src += 256;
+_mm256_stream_si256((((__m256i*)dst) + 0), c0);
+_mm256_stream_si256((((__m256i*)dst) + 1), c1);
+_mm256_stream_si256((((__m256i*)dst) + 2), c2);
+_mm256_stream_si256((((__m256i*)dst) + 3), c3);
+_mm256_stream_si256((((__m256i*)dst) + 4), c4);
+_mm256_stream_si256((((__m256i*)dst) + 5), c5);
+_mm256_stream_si256((((__m256i*)dst) + 6), c6);
+_mm256_stream_si256((((__m256i*)dst) + 7), c7);
+dst += 256;
+}
+}
+else { // source unaligned
+for (; size >= 256; size -= 256) {
+c0 = _mm256_loadu_si256(((const __m256i*)src) + 0);
+c1 = _mm256_loadu_si256(((const __m256i*)src) + 1);
+c2 = _mm256_loadu_si256(((const __m256i*)src) + 2);
+c3 = _mm256_loadu_si256(((const __m256i*)src) + 3);
+c4 = _mm256_loadu_si256(((const __m256i*)src) + 4);
+c5 = _mm256_loadu_si256(((const __m256i*)src) + 5);
+c6 = _mm256_loadu_si256(((const __m256i*)src) + 6);
+c7 = _mm256_loadu_si256(((const __m256i*)src) + 7);
+_mm_prefetch((const char*)(src + 512), _MM_HINT_NTA);
+src += 256;
+_mm256_stream_si256((((__m256i*)dst) + 0), c0);
+_mm256_stream_si256((((__m256i*)dst) + 1), c1);
+_mm256_stream_si256((((__m256i*)dst) + 2), c2);
+_mm256_stream_si256((((__m256i*)dst) + 3), c3);
+_mm256_stream_si256((((__m256i*)dst) + 4), c4);
+_mm256_stream_si256((((__m256i*)dst) + 5), c5);
+_mm256_stream_si256((((__m256i*)dst) + 6), c6);
+_mm256_stream_si256((((__m256i*)dst) + 7), c7);
+dst += 256;
+}
+}
+_mm_sfence();
+}
+
+__memcpy(dst, src, size);
+_mm256_zeroupper();
+
+return destination;
+}
+
+
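The rewritten avx_memcpy() above hardcodes cachesize = 0x200000 (2 MB) as the cutoff between ordinary cached stores and non-temporal streaming stores; streaming only pays off once the copy is large enough that cached stores would mostly evict useful lines. On glibc/Linux the threshold could be derived from the real cache size instead. A sketch, assuming the glibc-specific _SC_LEVEL3_CACHE_SIZE name (it may be absent or return 0/-1 elsewhere):

#include <unistd.h>
#include <stddef.h>

/* Sketch: pick the streaming-store threshold from the actual L3 size,
 * falling back to the patch's 2 MB constant. */
static size_t pick_cachesize(void)
{
    long l3 = -1;
#ifdef _SC_LEVEL3_CACHE_SIZE
    l3 = sysconf(_SC_LEVEL3_CACHE_SIZE);
#endif
    return (l3 > 0) ? (size_t)l3 : (size_t)0x200000;
}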
+static void * avx_memcpy2(void * to, const void * from, size_t len)
 {
 void *retval;
 size_t i;
 retval = to;

-/* PREFETCH has effect even for MOVSB instruction ;) */
-__asm__ __volatile__ (
-" prefetchnta (%0)\n"
-" prefetchnta 32(%0)\n"
-" prefetchnta 64(%0)\n"
-" prefetchnta 96(%0)\n"
-" prefetchnta 128(%0)\n"
-" prefetchnta 160(%0)\n"
-" prefetchnta 192(%0)\n"
-" prefetchnta 224(%0)\n"
-" prefetchnta 256(%0)\n"
-" prefetchnta 288(%0)\n"
-: : "r" (from) );
+if(len >= 256)

-if(len >= MIN_LEN)
 {
 register uintptr_t delta;
 /* Align destination to MMREG_SIZE -boundary */

@@ -1035,53 +1258,64 @@ static void * avx_memcpy(void * to, const void * from, size_t len)
 len -= delta;
 small_memcpy(to, from, delta);
 }
-i = len >> 7; /* len/128 */
-len&=127;
+i = len >> 8;
+len&=255;

+__asm__ __volatile__ (
+"prefetchnta 64(%0)\n"
+"prefetchnta 128(%0)\n"
+"prefetchnta 192(%0)\n"
+"prefetchnta 256(%0)\n"
+: : "r" (from)
+);

 if(((uintptr_t)from) & 31)
-/* if SRC is misaligned */
 for(; i>0; i--)
 {
 __asm__ __volatile__ (
-"prefetchnta 320(%0)\n"
-"prefetchnta 352(%0)\n"
-"prefetchnta 384(%0)\n"
-"prefetchnta 416(%0)\n"
 "vmovups (%0), %%ymm0\n"
 "vmovups 32(%0), %%ymm1\n"
 "vmovups 64(%0), %%ymm2\n"
 "vmovups 96(%0), %%ymm3\n"
+"vmovups 128(%0), %%ymm4\n"
+"vmovups 160(%0), %%ymm5\n"
+"vmovups 192(%0), %%ymm6\n"
+"vmovups 224(%0), %%ymm7\n"
 "vmovntps %%ymm0, (%1)\n"
 "vmovntps %%ymm1, 32(%1)\n"
 "vmovntps %%ymm2, 64(%1)\n"
 "vmovntps %%ymm3, 96(%1)\n"
+"vmovntps %%ymm4, 128(%1)\n"
+"vmovntps %%ymm5, 160(%1)\n"
+"vmovntps %%ymm6, 192(%1)\n"
+"vmovntps %%ymm7, 224(%1)\n"
 :: "r" (from), "r" (to) : "memory");
-from = ((const unsigned char *)from) + 128;
-to = ((unsigned char *)to) + 128;
+from = ((const unsigned char *)from) + 256;
+to = ((unsigned char *)to) + 256;
 }
 else
-/*
-Only if SRC is aligned on 16-byte boundary.
-It allows to use movaps instead of movups, which required data
-to be aligned or a general-protection exception (#GP) is generated.
-*/
 for(; i>0; i--)
 {
 __asm__ __volatile__ (
-"prefetchnta 320(%0)\n"
-"prefetchnta 352(%0)\n"
-"prefetchnta 384(%0)\n"
-"prefetchnta 416(%0)\n"
 "vmovaps (%0), %%ymm0\n"
 "vmovaps 32(%0), %%ymm1\n"
 "vmovaps 64(%0), %%ymm2\n"
 "vmovaps 96(%0), %%ymm3\n"
+"vmovaps 128(%0), %%ymm4\n"
+"vmovaps 160(%0), %%ymm5\n"
+"vmovaps 192(%0), %%ymm6\n"
+"vmovaps 224(%0), %%ymm7\n"
 "vmovntps %%ymm0, (%1)\n"
 "vmovntps %%ymm1, 32(%1)\n"
 "vmovntps %%ymm2, 64(%1)\n"
 "vmovntps %%ymm3, 96(%1)\n"
+"vmovntps %%ymm4, 128(%1)\n"
+"vmovntps %%ymm5, 160(%1)\n"
+"vmovntps %%ymm6, 192(%1)\n"
+"vmovntps %%ymm7, 224(%1)\n"
 :: "r" (from), "r" (to) : "memory");
-from = ((const unsigned char *)from) + 128;
-to = ((unsigned char *)to) + 128;
+from = ((const unsigned char *)from) + 256;
+to = ((unsigned char *)to) + 256;
 }
 /* since movntq is weakly-ordered, a "sfence"
 * is needed to become ordered again. */

@@ -1230,11 +1464,11 @@ static void *fast_memcpy(void * to, const void * from, size_t len)
 #ifndef HAVE_ONLY_MMX1
 /* PREFETCH has effect even for MOVSB instruction ;) */
 __asm__ volatile (
 PREFETCH" (%0)\n"
 PREFETCH" 64(%0)\n"
 PREFETCH" 128(%0)\n"
 PREFETCH" 192(%0)\n"
 PREFETCH" 256(%0)\n"
 : : "r" (from) );
 #endif

@@ -1249,7 +1483,7 @@ static void *fast_memcpy(void * to, const void * from, size_t len)
 len -= delta;
 small_memcpy(to, from, delta);
 }
 i = len >> 6; /* len/64 */
 len&=63;
 /*
 This algorithm is top effective when the code consequently

@@ -1337,10 +1571,10 @@ static void *fast_memcpy(void * to, const void * from, size_t len)
 "xor %%"REG_a", %%"REG_a" \n\t"
 ASMALIGN(4)
 "1: \n\t"
 "movl (%0, %%"REG_a"), %%ecx \n\t"
 "movl 32(%0, %%"REG_a"), %%ecx \n\t"
 "movl 64(%0, %%"REG_a"), %%ecx \n\t"
 "movl 96(%0, %%"REG_a"), %%ecx \n\t"
 "add $128, %%"REG_a" \n\t"
 "cmp %3, %%"REG_a" \n\t"
 " jb 1b \n\t"

@@ -1393,7 +1627,7 @@ static void *fast_memcpy(void * to, const void * from, size_t len)
 {
 __asm__ volatile (
 #ifndef HAVE_ONLY_MMX1
 PREFETCH" 320(%0)\n"
 #endif
 "movq (%0), %%mm0\n"
 "movq 8(%0), %%mm1\n"

@@ -1614,9 +1848,9 @@ static inline void memcpy_neon_256( uint8_t *dst, const uint8_t *src )
 : [src] "+r" (src), [dst] "+r" (dst)
 :: "memory" ,
 "d0", "d1", "d2", "d3", "d4", "d5", "d6" , "d7",
 "d8", "d9", "d10","d11","d12","d13","d14", "d15",
 "d16","d17","d18","d19","d20","d21","d22", "d23",
 "d24","d23","d24","d25","d26","d27","d28", "d29",
 "d30","d31"
 );

@@ -1911,7 +2145,7 @@ void memset_asimd_v2(void *dst, uint8_t val, size_t len) {
 }
 void memset_asimd_v4(void *dst, uint8_t val, size_t len) {
 if( len == 0 || NULL == dst )
 return;

 uint8x16_t v = vdupq_n_u8(val);
 size_t multiple_of_16 = len & ~0xF;

@@ -1931,7 +2165,7 @@ void memset_asimd_64(uint8_t *dst, uint8_t value, size_t size) {
 if( size == 0 || NULL == dst )
 return;

 uint8x16_t value_v = vdupq_n_u8(value);

 size_t num_blocks = size / 64;
 size_t remaining_bytes = size % 64;

@@ -1972,7 +2206,7 @@ void memset_asimd_32(uint8_t *dst, uint8_t value, size_t size) {
 if( size == 0 || dst == NULL )
 return;

 uint8x16_t value_v = vdupq_n_u8(value);

 size_t num_blocks = size / 32;
 size_t remaining_bytes = size % 32;

@@ -2050,6 +2284,13 @@ static struct {
 #endif
 #ifdef HAVE_ASM_AVX
 { "AVX optimized memcpy()", (void*) avx_memcpy, 0,AV_CPU_FLAG_AVX },
+{ "AVX simple memcpy()", (void*) avx_memcpy2, 0, AV_CPU_FLAG_AVX },
+#endif
+#ifdef HAVE_ASM_AVX2
+{ "AVX2 optimized memcpy()", (void*) avx2_memcpy, 0, AV_CPU_FLAG_AVX2 },
+#endif
+#ifdef HAVE_ASM_AVX512
+{ "AVX-512 optimized memcpy()", (void*) avx512_memcpy, 0, AV_CPU_FLAG_AVX512 },
 #endif
 #ifdef HAVE_ASM_MMX
 { "MMX optimized memcpy()", (void*) mmx_memcpy, 0,AV_CPU_FLAG_MMX },
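The three new entries slot into the same NULL-terminated table that drives the benchmark, the runtime CPU gate, and the environment-variable override. The element type is not visible in this hunk; from the fields used throughout the patch (.name, .function, .t, .cpu_require) it is presumably close to this sketch:

/* Assumed shape of the memcpy/memset method tables: a display name, the
 * routine, the last measured time, and the required AV_CPU_FLAG_* bits
 * (FFmpeg libavutil). The loops below start at index 1 and stop at the
 * NULL-named terminator. */
struct mem_method {
    const char *name;
    void       *function;
    double      t;
    int         cpu_require;
};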
@@ -2131,14 +2372,14 @@ static struct {
 { "memset align 32 (C) Harm Hanemaaijer <fgenfb@yahoo.com>", (void*) memset_new_align_32,0,0 },
 #endif
 #ifdef HAVE_ASM_SSE4_1
 { "SSE4_1 memset()", (void*) sse41_memset,0, AV_CPU_FLAG_SSE4},
 { "SSE4_1 memset() v2", (void*) sse41_memset_v2,0, AV_CPU_FLAG_SSE4},
 #endif
 #ifdef HAVE_ASM_SSE4_2
 { "SSE4_2 unaligned memset()", (void*) sse42_memset,0, AV_CPU_FLAG_SSE42},
 { "SSE4_2 aligned memset()", (void*) sse42_aligned_memset, 0, AV_CPU_FLAG_SSE42 },
 #endif
 { "64-bit word memset()", (void*) memset_64, 0, 0},

 { NULL, NULL, 0, 0},
 };

@@ -2176,11 +2417,7 @@ static int set_user_selected_memcpy()
 return i;
 }
 }
-veejay_msg(VEEJAY_MSG_ERROR, "No valid memcpy method selected, please use one of the following:");
-for( i = 1; memcpy_method[i].name; i ++ ) {
-veejay_msg(VEEJAY_MSG_ERROR, "\t\"%s\"", memcpy_method[i].name);
-}
-veejay_msg(VEEJAY_MSG_ERROR, "Using memcpy method '%s'", memcpy_method[1].name );
+veejay_msg(VEEJAY_MSG_INFO, "Using memcpy method '%s'", memcpy_method[1].name );
 }
 return 0;
 }
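This hunk is the user-preference half of the commit message: instead of dumping the whole method list as errors, set_user_selected_memcpy() now quietly falls back to memcpy_method[1]. The diff does not show where VEEJAY_MEMCPY_METHOD is actually read, but given the table layout the lookup is presumably close to the following sketch (the helper name and placement are mine, not the project's):

#include <stdlib.h>
#include <string.h>

/* Sketch of the assumed lookup: match $VEEJAY_MEMCPY_METHOD against the
 * memcpy_method[] names; 0 means unset/unknown, so the caller falls
 * back to memcpy_method[1]. */
static int lookup_memcpy_method(void)
{
    const char *want = getenv("VEEJAY_MEMCPY_METHOD");
    if (want == NULL)
        return 0;
    for (int i = 1; memcpy_method[i].name != NULL; i++) {
        if (strcmp(want, memcpy_method[i].name) == 0)
            return i;
    }
    return 0;
}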
@@ -2196,10 +2433,6 @@ static int set_user_selected_memset()
 return i;
 }
 }
-veejay_msg(VEEJAY_MSG_ERROR, "No valid memset method selected, please use one of the following:");
-for( i = 1; memset_method[i].name; i ++ ) {
-veejay_msg(VEEJAY_MSG_ERROR, "\t\"%s\"", memset_method[i].name);
-}
 veejay_msg(VEEJAY_MSG_ERROR, "Using memset method '%s'", memset_method[1].name );
 }
 return 0;

@@ -2208,7 +2441,7 @@ static int set_user_selected_memset()
 static void mem_fill_block(uint8_t *dst, size_t len) {
 int i;
 for( i = 0; i < len ; i ++ ) {
-dst[i] = i % 255;
+dst[i] = i % 256;
 }
 }

@@ -2235,17 +2468,17 @@ void find_best_memcpy()
 double t;
 uint8_t *buf1, *buf2, *validbuf;
 int i, k;
-int bufsize = 10 * 1048576;
+int bufsize = (BENCHMARK_WID * BENCHMARK_HEI * 3);

-if (!(buf1 = (uint8_t*) malloc( bufsize * sizeof(uint8_t))))
+if (!(buf1 = (uint8_t*) vj_malloc( bufsize * sizeof(uint8_t))))
 return;

-if (!(buf2 = (uint8_t*) malloc( bufsize * sizeof(uint8_t)))) {
+if (!(buf2 = (uint8_t*) vj_malloc( bufsize * sizeof(uint8_t)))) {
 free( buf1 );
 return;
 }

-if (!(validbuf = (uint8_t*) malloc( bufsize * sizeof(uint8_t)))) {
+if (!(validbuf = (uint8_t*) vj_malloc( bufsize * sizeof(uint8_t)))) {
 free( buf1 );
 free( buf2 );
 return;

@@ -2267,7 +2500,7 @@ void find_best_memcpy()
 memset(buf2, 0, bufsize);
 mem_fill_block(validbuf, bufsize);

-for( i = 1; memcpy_method[i].name; i ++ ) {
+for( i = 1; memcpy_method[i].name != NULL; i ++ ) {

 if( memcpy_method[i].cpu_require && !(cpu_flags & memcpy_method[i].cpu_require ) ) {
 memcpy_method[i].t = 0.0;

@@ -2284,13 +2517,18 @@ void find_best_memcpy()
 t = 0;
 }

+if( t > 0 )
+veejay_msg(VEEJAY_MSG_INFO, "method '%s' completed in %g seconds", memcpy_method[i].name, t );
+else
+veejay_msg(VEEJAY_MSG_WARNING, "method '%s' fails validation", memcpy_method[i].name );
+
 memcpy_method[i].t = t;
 }

-for( i = 1; memcpy_method[i].name; i ++ ) {
+for( i = 1; memcpy_method[i].name != NULL; i ++ ) {
 if(best == 0 ) {
 best = i;
 t = memcpy_method[i].t;
 continue;
 }

@@ -2314,6 +2552,7 @@ set_best_memcpy_method:
 selected_best_memcpy = best;

 veejay_msg(VEEJAY_MSG_INFO, "Selected %s", memcpy_method[best].name);
+veejay_msg(VEEJAY_MSG_WARNING, "export VEEJAY_MEMCPY_METHOD=\"%s\"", memcpy_method[best].name );
 }

 void find_best_memset()

@@ -2326,13 +2565,13 @@ void find_best_memset()
 double t;
 char *buf1, *buf2;
 int i, k;
-int bufsize = 10 * 1048576;
+int bufsize = (BENCHMARK_WID * BENCHMARK_HEI * 3);
 int cpu_flags = av_get_cpu_flags();

-if (!(buf1 = (char*) malloc( bufsize * sizeof(char) )))
+if (!(buf1 = (char*) vj_malloc( bufsize * sizeof(char) )))
 return;

-if (!(buf2 = (char*) malloc( bufsize * sizeof(char) ))) {
+if (!(buf2 = (char*) vj_malloc( bufsize * sizeof(char) ))) {
 free( buf1 );
 return;
 }

@@ -2342,7 +2581,7 @@ void find_best_memset()
 memset( buf1, 0, bufsize * sizeof(char));
 memset( buf2, 0, bufsize * sizeof(char));

-for (i=1; memset_method[i].name; i++)
+for (i=1; memset_method[i].name != NULL; i++)
 {
 if( memset_method[i].cpu_require && !(cpu_flags & memset_method[i].cpu_require ) ) {
 memset_method[i].t= 0;

@@ -2354,7 +2593,9 @@ void find_best_memset()
 memset_method[i].function( buf1 , 0 , bufsize );
 }
 t = get_time() - t;

+veejay_msg(VEEJAY_MSG_INFO, "method '%s' completed in %g seconds", memset_method[i].name, t );
+
 memset_method[i].t = t;

 if (best == 0 || t < memset_method[best].t)

@@ -2374,19 +2615,25 @@ set_best_memset_method:

 selected_best_memset = best;
 veejay_msg(VEEJAY_MSG_INFO, "Selected %s", memset_method[best].name);
+veejay_msg(VEEJAY_MSG_WARNING, "export VEEJAY_MEMSET_METHOD=\"%s\"", memset_method[best].name );

 }

-void vj_mem_set_defaults() {
-#ifdef STRICT_CHECKING
-veejay_memset = memset;
-veejay_memcpy = memcpy;
-veejay_msg(VEEJAY_MSG_WARNING, "Using default memcpy() / memset() functions");
-#else
+void vj_mem_set_defaults(int w, int h) {
+if( w > 0 )
+BENCHMARK_WID = w;
+if( h > 0 )
+BENCHMARK_HEI = h;

 veejay_memset = memset_method[1].function;
 veejay_memcpy = memcpy_method[1].function;
-#endif
+set_user_selected_memcpy();
+set_user_selected_memset();
 }


 static void vj_frame_copy_job( void *arg ) {
 int i;
 vj_task_arg_t *info = (vj_task_arg_t*) arg;
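With this change the initialization chain is vj_mem_init(w,h) → vj_mem_set_defaults(w,h) → set_user_selected_memcpy()/set_user_selected_memset(): frame geometry flows into the BENCHMARK_WID/BENCHMARK_HEI used to size the benchmark buffers, and the environment override is applied at startup. A minimal caller sketch against the new API (the 1280x720 value is just an example):

extern void vj_mem_init(int w, int h);

int main(void)
{
    /* Passing 0,0 — as the updated callers in this commit do — keeps
     * the 1920x1080 defaults; a caller that knows its output size can
     * pass it so the benchmark copies exactly one frame's worth. */
    vj_mem_init(1280, 720);
    return 0;
}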
@@ -2458,8 +2705,8 @@ void vj_frame_slow_threaded( uint8_t **p0_buffer, uint8_t **p1_buffer, uint8_t *

 #ifdef HAVE_ASM_MMX
 __asm __volatile(_EMMS" \n\t"
 SFENCE" \n\t"
 :::"memory");

 #endif
 /*

@@ -2635,30 +2882,34 @@ static void run_benchmark_test(int n_tasks, benchmark_func f, const char *str, i
 double stats[N];
 uint32_t i;
 double fastest = 0.0;
+double slowest = 0.0;
 float work_size = (planes[0] + planes[1] + planes[2] + planes[3]) / 1048576.0f;

-veejay_msg(VEEJAY_MSG_INFO, "run test '%s' (%dx) on chunks of %2.2f MB:", str, N, work_size );
+veejay_msg(VEEJAY_MSG_INFO, "run %dx test '%s' on chunks of %2.2f MB:", N, str, work_size );

 for( i = 0; i < N; i ++ )
 {
 stats[i] = f( n_frames, n_tasks, source, dest, planes );
-if( stats[i] > fastest )
+if(i == 0 || stats[i] < fastest )
 fastest = stats[i];

+if( stats[i] > slowest )
+slowest = stats[i];
 }

 double sum = 0.0;
-double slowest=fastest;
 for( i = 0; i < N; i ++ )
 {
-if( stats[i] < fastest ) {
-fastest = stats[i];
-}
 sum += stats[i];
 }

 double average = (sum / N);

-veejay_msg(VEEJAY_MSG_INFO, "run done: best score for %s is %g, worst is %g, average is %g",str, fastest, slowest, average );
+double fastest_ms = fastest * 1000000.0;
+double slowest_ms = slowest * 1000000.0;
+double average_ms = average * 1000000.0;
+
+veejay_msg(VEEJAY_MSG_INFO, "run done: best score for %s is %gms, worst is %gms, average is %gms",str, fastest_ms, slowest_ms, average_ms );
 }

 static void benchmark_tasks(unsigned int n_tasks, long n_frames, int w, int h)
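The scores above are get_time() deltas scaled by 1000000.0. get_time() itself is not part of this diff, so the printed unit depends on its definition; note that with a seconds-based clock the scaled values labelled 'ms' are really microseconds. A typical monotonic, seconds-based implementation would be the following (an assumption of mine, not the project's actual helper):

#include <time.h>

/* Sketch: monotonic wall clock in seconds, immune to NTP adjustments.
 * If get_time() looks like this, t * 1000000.0 is microseconds. */
static double get_time(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (double)ts.tv_sec + (double)ts.tv_nsec * 1e-9;
}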
@@ -2708,6 +2959,10 @@ void benchmark_veejay(int w, int h)

 veejay_msg(VEEJAY_MSG_INFO, "Starting benchmark %dx%d YUVP 4:2:2 (100 frames)", w,h);

+find_best_memcpy();
+
+find_best_memset();
+
 init_parallel_tasks( 0 );

 benchmark_tasks( 0,100,w,h );

@@ -146,7 +146,7 @@ int mem_align_size()
 return MEM_ALIGNMENT_SIZE;
 }

-void vj_mem_init(void)
+void vj_mem_init(int w, int h)
 {
 #if defined(ARCH_X86) || defined(ARCH_X86_X64) || defined(HAVE_ARM)
 CACHE_LINE_SIZE = get_cache_line_size();

@@ -158,7 +158,7 @@ void vj_mem_init(void)
 #endif
 //find_best_memcpy();
 //find_best_memset();
-vj_mem_set_defaults();
+vj_mem_set_defaults(w,h);
 }

 void vj_mem_optimize() {

@@ -24,9 +24,9 @@

 extern void *(* veejay_memcpy)(void *to, const void *from, size_t len);
 extern void *(* veejay_memset)(void *to, uint8_t val, size_t len);
-extern void vj_mem_init(void);
+extern void vj_mem_init(int w, int h);
 extern void vj_mem_optimize();
-extern void vj_mem_set_defaults();
+extern void vj_mem_set_defaults(int w, int h);
 extern int vj_mem_threaded_init(int w, int h);
 extern void *vj_malloc_(size_t size);
 extern void *vj_calloc_(size_t size );

@@ -1 +1 @@
--march=silvermont
+-march=alderlake

@@ -511,7 +511,7 @@ int main(int argc, char *argv[])
 return -1;
 }

-vj_mem_init();
+vj_mem_init(0,0);

 reconnect();