mirror of
https://github.com/game-stop/veejay.git
synced 2025-12-14 11:50:02 +01:00
add fastarm memcpy
This commit is contained in:
@@ -376,6 +376,27 @@ esac
|
||||
|
||||
CFLAGS="$CFLAGS -D_FILE_OFFSET_BITS=64 -D_LARGE_FILES"
|
||||
|
||||
if test x$host_alias != x; then
|
||||
dnl Cross compiling
|
||||
AC_MSG_CHECKING(sub-architecture settings)
|
||||
if test x$have_x86cpu = xtrue; then
|
||||
host_mod_cpu=`echo $host_cpu|tr _ -`
|
||||
ARCHFLAGS="-march=$host_mod_cpu -mcpu=$host_mod_cpu"
|
||||
AC_MSG_RESULT($ARCHFLAGS)
|
||||
fi
|
||||
else
|
||||
AC_MSG_CHECKING(sub-architecture settings)
|
||||
|
||||
chmod +x $srcdir/cpuinfo.sh
|
||||
|
||||
if test "$arch_target" = "auto"; then
|
||||
TMP=`$srcdir/cpuinfo.sh`
|
||||
ARCHFLAGS=`cat veejay.arch`
|
||||
else
|
||||
ARCHFLAGS="-mtune=generic"
|
||||
fi
|
||||
AC_MSG_RESULT($ARCHFLAGS)
|
||||
fi
|
||||
|
||||
dnl ARM architecture detect NEON and set CFLAGS
|
||||
if test x$have_arm = xtrue
|
||||
@@ -392,8 +413,11 @@ then
|
||||
if test $ac_cv_flag_neon = yes ; then
|
||||
AC_DEFINE(HAVE_ARM_NEON,1,[Compiling in NEON support])
|
||||
USER_CFLAGS="-mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad $USER_CFLAGS"
|
||||
FASTARM_CFLAGS="$ARCHFLAGS -Wa,-march=armv7-a -mthumb -Wa,-mthumb -Wa,-mimplicit-it=always -mthumb-interwork -DCONFIG_THUMB"
|
||||
|
||||
else
|
||||
USER_CFLAGS="-march=native -ftree-vectorize $USER_CFLAGS"
|
||||
FASTARM_CFLAGS="$ARCHFLAGS -Wa, -mthumb -Wa,-mthumb -Wa,-mimplicit-it=always -mthumb-interwork -DCONFIG_THUMB"
|
||||
fi
|
||||
|
||||
if test "x$enable_debug" != "xyes" ; then
|
||||
@@ -406,6 +430,8 @@ then
|
||||
SUBSAMPLE_CFLAGS="$USER_CFLAGS"
|
||||
VJE_CFLAGS="$USER_CFLAGS"
|
||||
CFLAGS="$USER_CFLAGS -D_FILE_OFFSET_BITS=64 -D_LARGE_FILES"
|
||||
|
||||
AC_SUBST(FASTARM_CFLAGS)
|
||||
fi
|
||||
|
||||
dnl This flag is used for PROGRAMS not SHARED LIBRARIES. PIC code is required
|
||||
@@ -584,28 +610,6 @@ EOF
|
||||
fi
|
||||
fi
|
||||
|
||||
if test x$host_alias != x; then
|
||||
dnl Cross compiling
|
||||
AC_MSG_CHECKING(sub-architecture settings)
|
||||
if test x$have_x86cpu = xtrue; then
|
||||
host_mod_cpu=`echo $host_cpu|tr _ -`
|
||||
ARCHFLAGS="-march=$host_mod_cpu -mcpu=$host_mod_cpu"
|
||||
AC_MSG_RESULT($ARCHFLAGS)
|
||||
fi
|
||||
else
|
||||
AC_MSG_CHECKING(sub-architecture settings)
|
||||
|
||||
chmod +x $srcdir/cpuinfo.sh
|
||||
|
||||
if test "$arch_target" = "auto"; then
|
||||
TMP=`$srcdir/cpuinfo.sh`
|
||||
ARCHFLAGS=`cat veejay.arch`
|
||||
else
|
||||
ARCHFLAGS="-mtune=generic"
|
||||
fi
|
||||
AC_MSG_RESULT($ARCHFLAGS)
|
||||
fi
|
||||
|
||||
have_mjpegtools=false
|
||||
AC_SUBST(MJPEGTOOLS_CFLAGS)
|
||||
AC_SUBST(MJPGETOOLS_LIBS)
|
||||
@@ -1074,6 +1078,7 @@ AM_CONDITIONAL(HAVE_JPEG,test x$have_jpeg = xtrue)
|
||||
AM_CONDITIONAL(HAVE_LIBLO,test x$have_liblo = xtrue)
|
||||
AM_CONDITIONAL(HAVE_FREETYPE2, test x$have_freetype2 = xtrue)
|
||||
AM_CONDITIONAL(HAVE_MJPEGTOOLS, test x$have_mjpegtools = xtrue )
|
||||
AM_CONDITIONAL(HAVE_ARM, test x$have_arm = xtrue )
|
||||
dnl *********************************************************************
|
||||
dnl Check for what warnings we want gcc to use and adjust the CFLAGS
|
||||
dnl as needed. This only works for GCC.
|
||||
@@ -1161,6 +1166,7 @@ fi
|
||||
|
||||
AC_CONFIG_FILES([
|
||||
thirdparty/Makefile
|
||||
thirdparty/fastarm/Makefile
|
||||
thirdparty/aclib/Makefile
|
||||
thirdparty/bio2jack/Makefile
|
||||
thirdparty/libhash/Makefile
|
||||
|
||||
@@ -141,6 +141,10 @@
|
||||
#include <libvje/vje.h>
|
||||
#include <veejay/vj-task.h>
|
||||
#include <libavutil/cpu.h>
|
||||
#ifdef HAVE_ARM
|
||||
#include <fastarm/new_arm.h>
|
||||
#endif
|
||||
|
||||
#define BUFSIZE 1024
|
||||
|
||||
|
||||
@@ -157,37 +161,12 @@
|
||||
static int selected_best_memcpy = 1;
|
||||
static int selected_best_memset = 1;
|
||||
|
||||
#ifdef HAVE_POSIX_TIMERS
|
||||
static int64_t _x_gettime(void)
|
||||
static double get_time()
|
||||
{
|
||||
struct timespec tm;
|
||||
return (clock_gettime(CLOCK_THREAD_CPUTIME_ID,&tm) == -1 )
|
||||
? times(NULL)
|
||||
: (int64_t) tm.tv_sec * 1e9 + tm.tv_nsec;
|
||||
struct timespec ts;
|
||||
clock_gettime( CLOCK_REALTIME, &ts );
|
||||
return (double) ts.tv_sec + (double) ts.tv_nsec / 1000000000.0;
|
||||
}
|
||||
#define rdtsc(x) _x_gettime()
|
||||
#elif (defined(ARCH_X86) || defined(ARCH_X86_64)) && defined(HAVE_SYS_TIMES_H)
|
||||
static int64_t rdtsc(int cpu_flags)
|
||||
{
|
||||
int64_t x;
|
||||
if( cpu_flags & AV_CPU_FLAGS_MMX ) {
|
||||
__asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
|
||||
return x;
|
||||
} else {
|
||||
return times(NULL);
|
||||
}
|
||||
}
|
||||
#else
|
||||
static uint64_t rdtsc(int cpu_flags)
|
||||
{
|
||||
#ifdef HAVE_SYS_TIMES_H
|
||||
struct tms tp;
|
||||
return times(&tp);
|
||||
#else
|
||||
return clock();
|
||||
#endif
|
||||
}
|
||||
#endif /* HAVE_SYS_TIMES_H */
|
||||
|
||||
#if defined(ARCH_X86) || defined (ARCH_X86_64)
|
||||
/* for small memory blocks (<256 bytes) this version is faster */
|
||||
@@ -251,7 +230,6 @@ void yuyv_plane_clear( size_t len, void *to )
|
||||
if( vj_task_available() ) {
|
||||
uint8_t * t = (uint8_t*) to;
|
||||
uint8_t *in[4] = { t, NULL,NULL,NULL };
|
||||
int strides[4] = { len, 0,0,0 };
|
||||
vj_task_run( in, in, NULL, NULL, 1, (performer_job_routine) &yuyv_plane_clear_job );
|
||||
}
|
||||
else {
|
||||
@@ -1349,10 +1327,10 @@ static void *memcpy_neon( void *to, const void *from, size_t n )
|
||||
|
||||
|
||||
static struct {
|
||||
char *name;
|
||||
void *(*function)(void *to, const void *from, size_t len);
|
||||
uint64_t time;
|
||||
uint32_t cpu_require;
|
||||
char *name;
|
||||
void *(*function)(void *to, const void *from, size_t len);
|
||||
double t;
|
||||
uint32_t cpu_require;
|
||||
} memcpy_method[] =
|
||||
{
|
||||
{ NULL, NULL, 0},
|
||||
@@ -1382,23 +1360,44 @@ static struct {
|
||||
#endif
|
||||
#ifdef HAVE_ARM_NEON
|
||||
{ "NEON optimized memcpy()", (void*) memcpy_neon, 0, AV_CPU_FLAG_NEON },
|
||||
#endif
|
||||
#ifdef HAVE_ARM
|
||||
{ "new mempcy for cortex with line size of 32, preload offset of 192 (C) Harm Hanemaaijer <fgenfb@yahoo.com>", (void*) memcpy_new_line_size_32_preload_192,0,0 },
|
||||
{ "new memcpy for cortex with line size of 64, preload offset of 192 (C) Harm Hanemaaijer <fgenfb@yahoo.com>" ,(void*) memcpy_new_line_size_64_preload_192, 0, 0 },
|
||||
{ "new memcpy for cortex with line size of 64, preload offset of 192, aligned access (C) Harm Hanemaaijer <fgenfb@yahoo.com>", (void*) memcpy_new_line_size_64_preload_192_aligned_access, 0, 0 },
|
||||
{ "new memcpy for cortex with line size of 32, preload offset of 192, align 32", (void*) memcpy_new_line_size_32_preload_192_align_32,0,0},
|
||||
{ "new memcpy for cortex with line size of 32, preload offset of 96", (void*) memcpy_new_line_size_32_preload_96,0,0},
|
||||
{ "new memcpy for cortex with line size of 32, preload offset of 96, aligned access", (void*) memcpy_new_line_size_32_preload_96_aligned_access,0,0},
|
||||
#endif
|
||||
#ifdef HAVE_ARM_NEON
|
||||
{ "new memcpy for cortex using NEON with line size of 32, preload offset of 192 (C) Harm Hanemaaijer <fgenfb@yahoo.com>", (void*) memcpy_new_neon_line_size_32,0,AV_CPU_FLAG_NEON},
|
||||
{ "new memcpy for cortex using NEON with line size of 64, preload offset of 192 (C) Harm Hanemaaijer <fgenfb@yahoo.com>", (void*) memcpy_new_neon_line_size_64,0,AV_CPU_FLAG_NEON},
|
||||
{ "new mempcy for cortex using NEON with line size of 32, automatic prefetcher (C) Harm Hanemaaijer <fgenfb@yayhoo.com>", (void*) memcpy_new_neon_line_size_32_auto,0,AV_CPU_FLAG_NEON},
|
||||
#endif
|
||||
{ NULL, NULL, 0},
|
||||
};
|
||||
|
||||
static struct {
|
||||
char *name;
|
||||
void *(*function)(void *to, uint8_t c, size_t len);
|
||||
uint64_t time;
|
||||
uint32_t cpu_require;
|
||||
char *name;
|
||||
void *(*function)(void *to, uint8_t c, size_t len);
|
||||
uint32_t cpu_require;
|
||||
double t;
|
||||
} memset_method[] =
|
||||
{
|
||||
{ NULL, NULL, 0,0},
|
||||
{ "glibc memset()", (void*)memset, 0,0},
|
||||
{ NULL, NULL, 0,0},
|
||||
{ "glibc memset()",(void*)memset,0,0},
|
||||
#if defined(HAVE_ASM_MMX) || defined(HAVE_ASM_MMX2) || defined(HAVE_ASM_SSE)
|
||||
{ "MMX/MMX2/SSE optimized memset()", (void*) fast_memset, 0, AV_CPU_FLAG_MMX|AV_CPU_FLAG_SSE|AV_CPU_FLAG_MMX2},
|
||||
{ "MMX/MMX2/SSE optimized memset()", (void*) fast_memset,0,AV_CPU_FLAG_MMX|AV_CPU_FLAG_SSE|AV_CPU_FLAG_MMX2 },
|
||||
#endif
|
||||
{ NULL, NULL, 0,0},
|
||||
#ifdef HAVE_ARM_NEON
|
||||
{ "memset_neon (C) Harm Hanemaaijer <fgenfb@yahoo.com>", (void*) memset_neon,0, AV_CPU_FLAG_NEON },
|
||||
#endif
|
||||
#ifdef HAVE_ARM
|
||||
{ "memset align 0 (C) Harm Hanemaaijer <fgenfb@yahoo.com>", (void*) memset_new_align_0,0,0 },
|
||||
{ "memset align 8 (C) Harm Hanemaaijer <fgenfb@yahoo.com>", (void*) memset_new_align_8,0,0 },
|
||||
{ "memset align 32 (C) Harm Hanemaaijer <fgenfb@yahoo.com>", (void*) memset_new_align_32,0,0 },
|
||||
#endif
|
||||
{ NULL, NULL, 0, 0},
|
||||
};
|
||||
|
||||
|
||||
@@ -1407,10 +1406,10 @@ void memcpy_report()
|
||||
int i;
|
||||
fprintf(stdout,"SIMD benchmark results:\n");
|
||||
for( i = 1; memset_method[i].name; i ++ ) {
|
||||
fprintf(stdout,"\t%8ld : %s\n",(long) memset_method[i].time, memset_method[i].name );
|
||||
fprintf(stdout,"\t%g : %s\n",memset_method[i].t, memset_method[i].name );
|
||||
}
|
||||
for( i = 1; memcpy_method[i].name; i ++ ) {
|
||||
fprintf(stdout,"\t%8ld : %s\n",(long) memcpy_method[i].time, memcpy_method[i].name );
|
||||
fprintf(stdout,"\t%g : %s\n",memcpy_method[i].t, memcpy_method[i].name );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1430,7 +1429,7 @@ char *get_memset_descr()
|
||||
|
||||
void find_best_memcpy()
|
||||
{
|
||||
uint64_t t;
|
||||
double t;
|
||||
char *buf1, *buf2;
|
||||
int i, best = 0,k;
|
||||
int bufsize = 720 * 576 * 3;
|
||||
@@ -1445,6 +1444,8 @@ void find_best_memcpy()
|
||||
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
veejay_msg(VEEJAY_MSG_DEBUG, "Finding best memcpy ..." );
|
||||
|
||||
memset(buf1,0, bufsize);
|
||||
memset(buf2,0, bufsize);
|
||||
|
||||
@@ -1454,28 +1455,28 @@ void find_best_memcpy()
|
||||
|
||||
for( i = 1; memcpy_method[i].name; i ++ ) {
|
||||
|
||||
t = rdtsc(cpu_flags);
|
||||
t = get_time();
|
||||
if( memcpy_method[i].cpu_require && !(cpu_flags & memcpy_method[i].cpu_require ) ) {
|
||||
memcpy_method[i].time = 0;
|
||||
memcpy_method[i].t = 0.0;
|
||||
continue;
|
||||
}
|
||||
|
||||
for( k = 0; k < 128; k ++ ) {
|
||||
memcpy_method[i].function( buf1,buf2, bufsize );
|
||||
}
|
||||
t = rdtsc(cpu_flags) - t;
|
||||
memcpy_method[i].time = t;
|
||||
t = get_time() - t;
|
||||
memcpy_method[i].t = t;
|
||||
}
|
||||
|
||||
for( i = 1; memcpy_method[i].name; i ++ ) {
|
||||
if(best == 0 ) {
|
||||
best = i;
|
||||
t = memcpy_method[i].time;
|
||||
t = memcpy_method[i].t;
|
||||
continue;
|
||||
}
|
||||
|
||||
if( memcpy_method[i].time < t && memcpy_method[i].time > 0 ) {
|
||||
t = memcpy_method[i].time;
|
||||
if( memcpy_method[i].t < t && memcpy_method[i].t > 0 ) {
|
||||
t = memcpy_method[i].t;
|
||||
best = i;
|
||||
}
|
||||
}
|
||||
@@ -1494,53 +1495,55 @@ void find_best_memcpy()
|
||||
|
||||
void find_best_memset()
|
||||
{
|
||||
uint64_t t;
|
||||
char *buf1, *buf2;
|
||||
int i, best = 0,k;
|
||||
double t;
|
||||
char *buf1, *buf2;
|
||||
int i, best = 0,k;
|
||||
int bufsize = 720 * 576 * 3;
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (!(buf1 = (char*) malloc( bufsize * sizeof(char) )))
|
||||
return;
|
||||
if (!(buf1 = (char*) malloc( bufsize * sizeof(char) )))
|
||||
return;
|
||||
|
||||
if (!(buf2 = (char*) malloc( bufsize * sizeof(char) ))) {
|
||||
free( buf1 );
|
||||
return;
|
||||
}
|
||||
|
||||
veejay_msg(VEEJAY_MSG_DEBUG, "Finding best memset..." );
|
||||
|
||||
if (!(buf2 = (char*) malloc( bufsize * sizeof(char) ))) {
|
||||
free( buf1 );
|
||||
return;
|
||||
}
|
||||
|
||||
memset( buf1, 0, bufsize * sizeof(char));
|
||||
memset( buf2, 0, bufsize * sizeof(char));
|
||||
|
||||
for (i=1; memset_method[i].name; i++)
|
||||
{
|
||||
for (i=1; memset_method[i].name; i++)
|
||||
{
|
||||
if( memset_method[i].cpu_require && !(cpu_flags & memset_method[i].cpu_require ) ) {
|
||||
memset_method[i].time= 0;
|
||||
memset_method[i].t= 0;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
t = rdtsc(cpu_flags);
|
||||
for( k = 0; k < 128; k ++ ) {
|
||||
memset_method[i].function( buf1 , 0 , bufsize );
|
||||
}
|
||||
t = rdtsc(cpu_flags) - t;
|
||||
|
||||
memset_method[i].time = t;
|
||||
t = get_time();
|
||||
for( k = 0; k < 128; k ++ ) {
|
||||
memset_method[i].function( buf1 , 0 , bufsize );
|
||||
}
|
||||
t = get_time() - t;
|
||||
|
||||
memset_method[i].t = t;
|
||||
|
||||
if (best == 0 || t < memset_method[best].time)
|
||||
best = i;
|
||||
}
|
||||
if (best == 0 || t < memset_method[best].t)
|
||||
best = i;
|
||||
}
|
||||
|
||||
if (best) {
|
||||
veejay_memset = memset_method[best].function;
|
||||
} else {
|
||||
veejay_memset = memset_method[1].function;
|
||||
}
|
||||
if (best) {
|
||||
veejay_memset = memset_method[best].function;
|
||||
}
|
||||
else {
|
||||
veejay_memset = memset_method[1].function;
|
||||
}
|
||||
|
||||
selected_best_memset = best;
|
||||
|
||||
free( buf1 );
|
||||
free( buf2 );
|
||||
|
||||
free( buf1 );
|
||||
free( buf2 );
|
||||
}
|
||||
|
||||
static void vj_frame_copy_job( void *arg ) {
|
||||
@@ -1721,122 +1724,118 @@ void vj_frame_clear1( uint8_t *input, unsigned int val, int size )
|
||||
vj_frame_clear( in, strides, val );
|
||||
}
|
||||
|
||||
static unsigned long benchmark_single_slow(long c, int n_tasks, uint8_t **source, uint8_t **dest, int *planes)
|
||||
static double benchmark_single_slow(long c, int n_tasks, uint8_t **source, uint8_t **dest, int *planes)
|
||||
{
|
||||
uint64_t k;
|
||||
uint64_t stats[c];
|
||||
int k;
|
||||
double stats[c];
|
||||
uint64_t bytes = ( planes[0] + planes[1] + planes[2] + planes[3] );
|
||||
|
||||
for( k = 0; k < c; k ++ )
|
||||
{
|
||||
uint64_t t = rdtsc(0);
|
||||
double t = get_time();
|
||||
vj_frame_slow_single( source, source, dest, planes[0], planes[1]/2, 0.67f );
|
||||
t = rdtsc(0) - t;
|
||||
t = get_time() - t;
|
||||
stats[k] = t;
|
||||
}
|
||||
|
||||
uint64_t sum = 0;
|
||||
double sum = 0.0;
|
||||
for( k = 0; k < c ;k ++ )
|
||||
sum += stats[k];
|
||||
|
||||
uint64_t best_time = (sum / c );
|
||||
double best_time = (sum / c );
|
||||
|
||||
veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %2.2f ms",
|
||||
(float)(bytes /1048576.0f), (best_time/1000.0f));
|
||||
veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %g",(float)((bytes*c) /1048576.0f), best_time);
|
||||
|
||||
return best_time;
|
||||
}
|
||||
|
||||
|
||||
static unsigned long benchmark_threaded_slow(long c, int n_tasks, uint8_t **source, uint8_t **dest, int *planes)
|
||||
static double benchmark_threaded_slow(long c, int n_tasks, uint8_t **source, uint8_t **dest, int *planes)
|
||||
{
|
||||
uint64_t k;
|
||||
uint64_t stats[c];
|
||||
int k;
|
||||
double stats[c];
|
||||
uint64_t bytes = ( planes[0] + planes[1] + planes[2] + planes[3] );
|
||||
|
||||
for( k = 0; k < c; k ++ )
|
||||
{
|
||||
uint64_t t = rdtsc(0);
|
||||
uint64_t t = get_time();
|
||||
vj_frame_slow_threaded( source, source, dest, planes[0], planes[1]/2, 0.67f );
|
||||
t = rdtsc(0) - t;
|
||||
t = get_time() - t;
|
||||
stats[k] = t;
|
||||
}
|
||||
|
||||
uint64_t sum = 0;
|
||||
double sum = 0.0;
|
||||
for( k = 0; k < c ;k ++ )
|
||||
sum += stats[k];
|
||||
|
||||
uint64_t best_time = (sum / c );
|
||||
double best_time = (sum / c );
|
||||
|
||||
veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %2.2f ms",
|
||||
(float)(bytes /1048576.0f), (best_time/1000.0f));
|
||||
veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %g",(float)((bytes*c) /1048576.0f), best_time);
|
||||
|
||||
return best_time;
|
||||
}
|
||||
|
||||
|
||||
static unsigned long benchmark_threaded_copy(long c, int n_tasks, uint8_t **dest, uint8_t **source, int *planes)
|
||||
static double benchmark_threaded_copy(long c, int n_tasks, uint8_t **dest, uint8_t **source, int *planes)
|
||||
{
|
||||
uint64_t k;
|
||||
uint64_t stats[c];
|
||||
int k;
|
||||
double stats[c];
|
||||
uint64_t bytes = ( planes[0] + planes[1] + planes[2] + planes[3] );
|
||||
|
||||
for( k = 0; k < c; k ++ )
|
||||
{
|
||||
uint64_t t = rdtsc(0);
|
||||
double t = get_time();
|
||||
vj_frame_copyN( source,dest,planes );
|
||||
t = rdtsc(0) - t;
|
||||
t = get_time() - t;
|
||||
stats[k] = t;
|
||||
}
|
||||
|
||||
uint64_t sum = 0;
|
||||
double sum = 0.0;
|
||||
for( k = 0; k < c ;k ++ )
|
||||
sum += stats[k];
|
||||
|
||||
uint64_t best_time = (sum / c );
|
||||
double best_time = (sum / c );
|
||||
|
||||
veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %2.2f ms",
|
||||
(float)(bytes /1048576.0f), (best_time/1000.0f));
|
||||
veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %g",(float)((bytes*c) /1048576.0f), best_time);
|
||||
|
||||
return best_time;
|
||||
}
|
||||
|
||||
static unsigned long benchmark_single_copy(long c,int dummy, uint8_t **dest, uint8_t **source, int *planes)
|
||||
static double benchmark_single_copy(long c,int dummy, uint8_t **dest, uint8_t **source, int *planes)
|
||||
{
|
||||
uint64_t k; int j;
|
||||
uint64_t stats[c];
|
||||
int k; int j;
|
||||
double stats[c];
|
||||
uint64_t bytes = ( planes[0] + planes[1] + planes[2] + planes[3] );
|
||||
|
||||
for( k = 0; k < c; k ++ ) {
|
||||
uint64_t t = rdtsc(0);
|
||||
double t = get_time();
|
||||
for( j = 0; j < 4; j ++ ) {
|
||||
veejay_memcpy( dest[j], source[j], planes[j] );
|
||||
}
|
||||
t = rdtsc(0) - t;
|
||||
t = get_time() - t;
|
||||
stats[k] = t;
|
||||
}
|
||||
|
||||
uint64_t sum = 0;
|
||||
double sum = 0.0;
|
||||
for( k = 0; k < c; k ++ )
|
||||
sum += stats[k];
|
||||
|
||||
uint64_t best_time = (sum/c);
|
||||
double best_time = (sum/c);
|
||||
|
||||
veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %2.2f ms",
|
||||
(float)(bytes /1048576.0f), (best_time/1000.0f));
|
||||
veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %g",(float)((bytes*c) /1048576.0f), best_time);
|
||||
|
||||
return best_time;
|
||||
}
|
||||
|
||||
typedef unsigned long (*benchmark_func)(long c, int dummy, uint8_t **dest, uint8_t **source, int *planes);
|
||||
typedef double (*benchmark_func)(long c, int dummy, uint8_t **dest, uint8_t **source, int *planes);
|
||||
|
||||
|
||||
void run_benchmark_test(int n_tasks, benchmark_func f, char *str, int n_frames, uint8_t **dest, uint8_t **source, int *planes )
|
||||
{
|
||||
uint32_t N = 8;
|
||||
uint64_t stats[N];
|
||||
int N = 8;
|
||||
double stats[N];
|
||||
uint32_t i;
|
||||
uint64_t fastest = 0;
|
||||
double fastest = 0.0;
|
||||
float work_size = (planes[0] + planes[1] + planes[2] + planes[3]) / 1048576.0f;
|
||||
|
||||
veejay_msg(VEEJAY_MSG_INFO, "run test '%s' (%dx) on chunks of %2.2f MB:", str, N, work_size );
|
||||
@@ -1848,8 +1847,8 @@ void run_benchmark_test(int n_tasks, benchmark_func f, char *str, int n_frames,
|
||||
fastest = stats[i];
|
||||
}
|
||||
|
||||
uint64_t sum = 0;
|
||||
uint64_t slowest=fastest;
|
||||
double sum = 0.0;
|
||||
double slowest=fastest;
|
||||
for( i = 0; i < N; i ++ )
|
||||
{
|
||||
if( stats[i] < fastest ) {
|
||||
@@ -1860,8 +1859,7 @@ void run_benchmark_test(int n_tasks, benchmark_func f, char *str, int n_frames,
|
||||
|
||||
float average = (sum / N);
|
||||
|
||||
veejay_msg(VEEJAY_MSG_INFO, "run done: best score for %s is %2.4f ms, worst is %2.4f ms, average is %2.4f ms",
|
||||
str, fastest/1000.0f, slowest/1000.0f, average/1000.0f );
|
||||
veejay_msg(VEEJAY_MSG_INFO, "run done: best score for %s is %g, worst is %g, average is %g",str, fastest, slowest, average );
|
||||
}
|
||||
|
||||
void benchmark_tasks(int n_tasks, long n_frames, int w, int h)
|
||||
@@ -1914,16 +1912,19 @@ void benchmark_veejay(int w, int h)
|
||||
if( h < 64)
|
||||
h = 64;
|
||||
|
||||
veejay_msg(VEEJAY_MSG_INFO, "Starting benchmark %dx%d YUVP 4:2:2 (100 frames)", w,h);
|
||||
|
||||
int n_tasks = task_num_cpus();
|
||||
init_parallel_tasks( n_tasks );
|
||||
char *str2 = getenv( "VEEJAY_MULTITHREAD_TASKS" );
|
||||
if( str2 ) {
|
||||
n_tasks = atoi(str2);
|
||||
}
|
||||
|
||||
int n_frames = 100;
|
||||
veejay_msg(VEEJAY_MSG_INFO, "Benchmark %dx%d YUVP 4:2:2 (%d frames)", w,h,n_frames);
|
||||
benchmark_tasks( n_tasks, n_frames,w,h );
|
||||
|
||||
veejay_msg(VEEJAY_MSG_INFO, "VEEJAY_MULTITHREAD_TASKS=%d", n_tasks );
|
||||
|
||||
init_parallel_tasks( n_tasks );
|
||||
|
||||
benchmark_tasks( n_tasks,100,w,h );
|
||||
}
|
||||
|
||||
void *vj_hmalloc(size_t sze, const char *name)
|
||||
|
||||
@@ -183,10 +183,6 @@ int vj_mem_threaded_init(int w, int h)
|
||||
num_tasks = n_cpus;
|
||||
if( num_tasks < 1 )
|
||||
num_tasks = 1;
|
||||
|
||||
if( num_tasks > 1 )
|
||||
veejay_msg( VEEJAY_MSG_INFO, "Using %d threads scheduled over %d cpus in performer.", num_tasks, n_cpus-1 );
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -3,3 +3,8 @@ SUBDIRS = aclib bio2jack libhash liblzo libOSC libresample
|
||||
if !HAVE_MJPEGTOOLS
|
||||
SUBDIRS += mjpegtools
|
||||
endif
|
||||
|
||||
if HAVE_ARM
|
||||
SUBDIRS += fastarm
|
||||
endif
|
||||
|
||||
|
||||
10
veejay-current/veejay-server/thirdparty/fastarm/Makefile.am
vendored
Normal file
10
veejay-current/veejay-server/thirdparty/fastarm/Makefile.am
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
# Makefile for veejay
|
||||
MAINTAINERCLEANFILES = Makefile.in
|
||||
AM_CFLAGS = $(FASTARM_CFLAGS)
|
||||
AM_CPPFLAGS = -I$(top_srcdir) -I$(includedir) \
|
||||
-I$(top_srcdir)/thirdparty $(FASTARM_CFLAGS)
|
||||
|
||||
FASTARM_LIB_FILE = libfastarm.la
|
||||
noinst_LTLIBRARIES = $(FASTARM_LIB_FILE)
|
||||
libfastarm_la_SOURCES = new_arm.S
|
||||
EXTRA_DIST=
|
||||
97
veejay-current/veejay-server/thirdparty/fastarm/README
vendored
Normal file
97
veejay-current/veejay-server/thirdparty/fastarm/README
vendored
Normal file
@@ -0,0 +1,97 @@
|
||||
fastarm
|
||||
|
||||
This toolkit contains a set of fast memcpy/memset variants for ARM
|
||||
platforms. They either use the standard register file, or optionally
|
||||
NEON instructions.
|
||||
|
||||
Several basic families of variants are provided; the current ones are
|
||||
the "new memcpy" variants which are the default for memcpy replacement,
|
||||
which generally do not overfetch beyond the source region and can be
|
||||
configured to use unaligned memory access for small sizes, or to use
|
||||
strictly aligned memory access. This family can also be configured to
|
||||
include a fast path for smaller sizes (this is the default); disabling
|
||||
this results in smaller code size at the expense of worse performance
|
||||
for small sizes. NEON optimized versions, which are generally faster
|
||||
with reduced code size, are also provided.
|
||||
|
||||
To compile the benchmark program, run 'make'. This will compile in a
|
||||
plethora of variants with different preload strategies, block sizes,
|
||||
alignment etc.
|
||||
|
||||
A benchmark program to compare various memcpy variants is provided. Try
|
||||
something like "./benchmark --memcpy ad --all". (Use --memcpy al on the
|
||||
Raspberry Pi platform).
|
||||
|
||||
To compile a memcpy replacement library, set PLATFORM to one of the
|
||||
values described at the beginning of the Makefile. This selects the
|
||||
cache line size to use and whether to use NEON versions.
|
||||
|
||||
Optionally disable Thumb2 mode compilation by commenting out the THUMBFLAGS
|
||||
definition. It must be disabled on the Raspberry Pi.
|
||||
|
||||
Then run:
|
||||
|
||||
sudo make install_memcpy_replacement
|
||||
|
||||
The replacement memcpy/memset shared library will be installed into
|
||||
/usr/lib/arm-linux-gnueabihf/ as libfastarm.so.
|
||||
|
||||
To enable the use of the replacement memcpy in applications, create or edit
|
||||
the file /etc/ld.so.preload so that it contains the line:
|
||||
|
||||
/usr/lib/arm-linux-gnueabihf/libfastarm.so
|
||||
|
||||
On the RPi platform, references to libcofi_rpi.so should be commented out
|
||||
or deleted. The new memcpy should now be activated for newly launched
|
||||
programs. To be sure, reboot or run:
|
||||
|
||||
sudo ldconfig
|
||||
|
||||
To revert to the default optimized memcpy on the RPi platform,
|
||||
edit /etc/ld.so.preload so that it contains the line:
|
||||
|
||||
/usr/lib/arm-linux-gnueabihf/libcofi_rpi.so
|
||||
|
||||
instead of the one using libfastarm.so.
|
||||
|
||||
Note on cache line size:
|
||||
|
||||
Although assuming a preload line size of 64 bytes is a little faster on several
|
||||
Cortex platforms for small to moderate sizes, when accessing DRAM
|
||||
with larger sizes, assuming 32 byte preloads seems to be faster. On earlier
|
||||
Cortex A9 models, 32 byte preloads are required for good performance in all
|
||||
cases.
|
||||
|
||||
Notes on performance with and without NEON:
|
||||
|
||||
For NEON-based memcpy, a significant benefit is seen on the tested Cortex A8
|
||||
platform for unaligned copies in cache memory and for aligned and unaligned
|
||||
copies in DRAM. Performance for aligned copies in cache memory is relatively
|
||||
similar to the optimized non-NEON function.
|
||||
|
||||
Results in MB/s on a Cortex A8, with Thumb2 mode enabled, of
|
||||
standard libc (Debian unstable), armv7 and NEON optimized memcpy
|
||||
variants with line size of 32 bytes:
|
||||
|
||||
libc armv7 NEON
|
||||
test 0 522 549 567
|
||||
test 1 329 377 378
|
||||
test 2 434 430 513
|
||||
test 28 351 361 458
|
||||
test 29 246 248 358
|
||||
test 43 467 512 581
|
||||
|
||||
Test 0 in the benchmark program tests word-aligned requests with
|
||||
sizes that are a power of 2 up to 4096 bytes distributed according
|
||||
to a power law.
|
||||
Test 1 in the benchmark program tests word-aligned requests with
|
||||
sizes up to 1024 that are a multiple of 4, distributed according
|
||||
to a power law.
|
||||
Test 2 in the benchmark program tests unaligned requests with sizes
|
||||
up to 1023 bytes.
|
||||
Test 28 in the benchmark program tests word aligned requests in DRAM
|
||||
with sizes up to 1024 bytes.
|
||||
Test 29 in the benchmark program tests word aligned requests in DRAM
|
||||
with sizes up to 256 bytes.
|
||||
Test 43 in the benchmark program tests page aligned requests in DRAM
|
||||
of size 4096 (copying a memory page).
|
||||
1858
veejay-current/veejay-server/thirdparty/fastarm/new_arm.S
vendored
Normal file
1858
veejay-current/veejay-server/thirdparty/fastarm/new_arm.S
vendored
Normal file
File diff suppressed because it is too large
Load Diff
35
veejay-current/veejay-server/thirdparty/fastarm/new_arm.h
vendored
Normal file
35
veejay-current/veejay-server/thirdparty/fastarm/new_arm.h
vendored
Normal file
@@ -0,0 +1,35 @@
|
||||
|
||||
extern void *memcpy_new_line_size_64_preload_192(void *dest,
|
||||
const void *src, size_t n);
|
||||
|
||||
extern void *memcpy_new_line_size_64_preload_192_align_32(void *dest,
|
||||
const void *src, size_t n);
|
||||
|
||||
extern void *memcpy_new_line_size_64_preload_192_aligned_access(void *dest,
|
||||
const void *src, size_t n);
|
||||
|
||||
extern void *memcpy_new_line_size_32_preload_192(void *dest,
|
||||
const void *src, size_t n);
|
||||
|
||||
extern void *memcpy_new_line_size_32_preload_192_align_32(void *dest,
|
||||
const void *src, size_t n);
|
||||
|
||||
extern void *memcpy_new_line_size_32_preload_96(void *dest,
|
||||
const void *src, size_t n);
|
||||
|
||||
extern void *memcpy_new_line_size_32_preload_96_aligned_access(void *dest,
|
||||
const void *src, size_t n);
|
||||
|
||||
extern void *memcpy_new_neon_line_size_64(void *dest, const void *src, size_t n);
|
||||
|
||||
extern void *memcpy_new_neon_line_size_32(void *dest, const void *src, size_t n);
|
||||
|
||||
extern void *memcpy_new_neon_line_size_32_auto(void *dest, const void *src, size_t n);
|
||||
|
||||
extern void *memset_new_align_0(void *dest, int c, size_t size);
|
||||
|
||||
extern void *memset_new_align_8(void *dest, int c, size_t size);
|
||||
|
||||
extern void *memset_new_align_32(void *dest, int c, size_t size);
|
||||
|
||||
extern void *memset_neon(void *dest, int c, size_t size);
|
||||
@@ -66,6 +66,10 @@ if !HAVE_MJPEGTOOLS
|
||||
libveejay_la_LIBADD+=-L$(top_builddir)/thirdparty/mjpegtools -lmjpegutils
|
||||
endif
|
||||
|
||||
if HAVE_ARM
|
||||
libveejay_la_LIBADD+=-L$(top_builddir)/thirdparty/fastarm -lfastarm
|
||||
endif
|
||||
|
||||
libveejay_la_LDFLAGS += $(SDL_LIBS) $(SDL_TTF_LIBS) $(DIRECTFB_LIBS) $(X_LIBS) $(PTHREAD_LIBS) $(FT_LDFLAGS) $(FT_LIBS) \
|
||||
$(XML2_LIBS) $(JPEG_LIBS) $(LIBLO_LIBS) $(LIBUNWIND_LIBS) $(GLIB_LIBS) \
|
||||
$(FFMPEG_LIBS) $(XINERAMA_LIBS) $(MJPEGTOOLS_LIBS) $(LIBPNG_LIBS) \
|
||||
|
||||
@@ -788,7 +788,7 @@ int main(int argc, char **argv)
|
||||
{
|
||||
veejay_free(info);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
print_license();
|
||||
|
||||
|
||||
Reference in New Issue
Block a user