add fastarm memcpy

This commit is contained in:
c0ntrol
2016-02-13 18:53:36 +01:00
parent 61f7de936c
commit 1eb4ff18eb
10 changed files with 2172 additions and 160 deletions

View File

@@ -376,6 +376,27 @@ esac
CFLAGS="$CFLAGS -D_FILE_OFFSET_BITS=64 -D_LARGE_FILES"
dnl Select sub-architecture compiler flags and store them in ARCHFLAGS.
dnl   cross compiling (host_alias is set): on x86, derive -march/-mcpu from host_cpu
dnl   native build: when arch_target is auto, run cpuinfo.sh and read veejay.arch
dnl   (presumably written by that script); otherwise fall back to -mtune=generic
if test "x$host_alias" != x; then
	dnl Cross compiling
	AC_MSG_CHECKING(sub-architecture settings)
	if test x$have_x86cpu = xtrue; then
		host_mod_cpu=`echo $host_cpu|tr _ -`
		ARCHFLAGS="-march=$host_mod_cpu -mcpu=$host_mod_cpu"
	fi
	dnl Always terminate the checking line, also on non-x86 hosts where
	dnl no flags are derived
	AC_MSG_RESULT([${ARCHFLAGS:-none}])
else
	AC_MSG_CHECKING(sub-architecture settings)
	chmod +x $srcdir/cpuinfo.sh
	if test "$arch_target" = "auto"; then
		TMP=`$srcdir/cpuinfo.sh`
		ARCHFLAGS=`cat veejay.arch`
	else
		ARCHFLAGS="-mtune=generic"
	fi
	AC_MSG_RESULT($ARCHFLAGS)
fi
dnl ARM architecture detect NEON and set CFLAGS
if test x$have_arm = xtrue
@@ -392,8 +413,11 @@ then
dnl Pick the ARM compiler flags. FASTARM_CFLAGS holds the flags for the
dnl fastarm assembly sources; both variants build them in Thumb mode
dnl (-mthumb, -DCONFIG_THUMB).
if test $ac_cv_flag_neon = yes ; then
	AC_DEFINE(HAVE_ARM_NEON,1,[Compiling in NEON support])
	USER_CFLAGS="-mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad $USER_CFLAGS"
	FASTARM_CFLAGS="$ARCHFLAGS -Wa,-march=armv7-a -mthumb -Wa,-mthumb -Wa,-mimplicit-it=always -mthumb-interwork -DCONFIG_THUMB"
else
	USER_CFLAGS="-march=native -ftree-vectorize $USER_CFLAGS"
	dnl No explicit assembler -march here; the stray empty -Wa, option
	dnl that used to stand in its place has been dropped
	FASTARM_CFLAGS="$ARCHFLAGS -mthumb -Wa,-mthumb -Wa,-mimplicit-it=always -mthumb-interwork -DCONFIG_THUMB"
fi
if test "x$enable_debug" != "xyes" ; then
@@ -406,6 +430,8 @@ then
SUBSAMPLE_CFLAGS="$USER_CFLAGS"
VJE_CFLAGS="$USER_CFLAGS"
CFLAGS="$USER_CFLAGS -D_FILE_OFFSET_BITS=64 -D_LARGE_FILES"
AC_SUBST(FASTARM_CFLAGS)
fi
dnl This flag is used for PROGRAMS not SHARED LIBRARIES. PIC code is required
@@ -584,28 +610,6 @@ EOF
fi
fi
if test x$host_alias != x; then
dnl Cross compiling
AC_MSG_CHECKING(sub-architecture settings)
if test x$have_x86cpu = xtrue; then
host_mod_cpu=`echo $host_cpu|tr _ -`
ARCHFLAGS="-march=$host_mod_cpu -mcpu=$host_mod_cpu"
AC_MSG_RESULT($ARCHFLAGS)
fi
else
AC_MSG_CHECKING(sub-architecture settings)
chmod +x $srcdir/cpuinfo.sh
if test "$arch_target" = "auto"; then
TMP=`$srcdir/cpuinfo.sh`
ARCHFLAGS=`cat veejay.arch`
else
ARCHFLAGS="-mtune=generic"
fi
AC_MSG_RESULT($ARCHFLAGS)
fi
have_mjpegtools=false
AC_SUBST(MJPEGTOOLS_CFLAGS)
AC_SUBST(MJPGETOOLS_LIBS)
@@ -1074,6 +1078,7 @@ AM_CONDITIONAL(HAVE_JPEG,test x$have_jpeg = xtrue)
AM_CONDITIONAL(HAVE_LIBLO,test x$have_liblo = xtrue)
AM_CONDITIONAL(HAVE_FREETYPE2, test x$have_freetype2 = xtrue)
AM_CONDITIONAL(HAVE_MJPEGTOOLS, test x$have_mjpegtools = xtrue )
AM_CONDITIONAL(HAVE_ARM, test x$have_arm = xtrue )
dnl *********************************************************************
dnl Check for what warnings we want gcc to use and adjust the CFLAGS
dnl as needed. This only works for GCC.
@@ -1161,6 +1166,7 @@ fi
AC_CONFIG_FILES([
thirdparty/Makefile
thirdparty/fastarm/Makefile
thirdparty/aclib/Makefile
thirdparty/bio2jack/Makefile
thirdparty/libhash/Makefile

View File

@@ -141,6 +141,10 @@
#include <libvje/vje.h>
#include <veejay/vj-task.h>
#include <libavutil/cpu.h>
#ifdef HAVE_ARM
#include <fastarm/new_arm.h>
#endif
#define BUFSIZE 1024
@@ -157,37 +161,12 @@
static int selected_best_memcpy = 1;
static int selected_best_memset = 1;
/*
 * Return a timestamp in seconds (with nanosecond resolution folded into the
 * fractional part) for measuring elapsed time.
 *
 * Only the difference between two return values is meaningful. The
 * monotonic clock is used so that wall-clock adjustments made while a
 * benchmark runs cannot skew an interval or turn it negative.
 *
 * Returns 0.0 when the clock cannot be read.
 */
static double get_time(void)
{
	struct timespec ts;

	if( clock_gettime( CLOCK_MONOTONIC, &ts ) != 0 )
		return 0.0;

	return (double) ts.tv_sec + (double) ts.tv_nsec / 1000000000.0;
}
#if defined(ARCH_X86) || defined (ARCH_X86_64)
/* for small memory blocks (<256 bytes) this version is faster */
@@ -251,7 +230,6 @@ void yuyv_plane_clear( size_t len, void *to )
if( vj_task_available() ) {
uint8_t * t = (uint8_t*) to;
uint8_t *in[4] = { t, NULL,NULL,NULL };
int strides[4] = { len, 0,0,0 };
vj_task_run( in, in, NULL, NULL, 1, (performer_job_routine) &yuyv_plane_clear_job );
}
else {
@@ -1349,10 +1327,10 @@ static void *memcpy_neon( void *to, const void *from, size_t n )
static struct {
char *name;
void *(*function)(void *to, const void *from, size_t len);
uint64_t time;
uint32_t cpu_require;
char *name;
void *(*function)(void *to, const void *from, size_t len);
double t;
uint32_t cpu_require;
} memcpy_method[] =
{
{ NULL, NULL, 0},
@@ -1382,23 +1360,44 @@ static struct {
#endif
#ifdef HAVE_ARM_NEON
{ "NEON optimized memcpy()", (void*) memcpy_neon, 0, AV_CPU_FLAG_NEON },
#endif
#ifdef HAVE_ARM
	/* fastarm variants without a CPU feature requirement (cpu_require is 0) */
	{ "new memcpy for cortex with line size of 32, preload offset of 192 (C) Harm Hanemaaijer <fgenfb@yahoo.com>", (void*) memcpy_new_line_size_32_preload_192,0,0 },
	{ "new memcpy for cortex with line size of 64, preload offset of 192 (C) Harm Hanemaaijer <fgenfb@yahoo.com>" ,(void*) memcpy_new_line_size_64_preload_192, 0, 0 },
	{ "new memcpy for cortex with line size of 64, preload offset of 192, aligned access (C) Harm Hanemaaijer <fgenfb@yahoo.com>", (void*) memcpy_new_line_size_64_preload_192_aligned_access, 0, 0 },
	{ "new memcpy for cortex with line size of 32, preload offset of 192, align 32", (void*) memcpy_new_line_size_32_preload_192_align_32,0,0},
	{ "new memcpy for cortex with line size of 32, preload offset of 96", (void*) memcpy_new_line_size_32_preload_96,0,0},
	{ "new memcpy for cortex with line size of 32, preload offset of 96, aligned access", (void*) memcpy_new_line_size_32_preload_96_aligned_access,0,0},
#endif
#ifdef HAVE_ARM_NEON
	/* fastarm NEON variants, only benchmarked when the CPU reports NEON */
	{ "new memcpy for cortex using NEON with line size of 32, preload offset of 192 (C) Harm Hanemaaijer <fgenfb@yahoo.com>", (void*) memcpy_new_neon_line_size_32,0,AV_CPU_FLAG_NEON},
	{ "new memcpy for cortex using NEON with line size of 64, preload offset of 192 (C) Harm Hanemaaijer <fgenfb@yahoo.com>", (void*) memcpy_new_neon_line_size_64,0,AV_CPU_FLAG_NEON},
	{ "new memcpy for cortex using NEON with line size of 32, automatic prefetcher (C) Harm Hanemaaijer <fgenfb@yahoo.com>", (void*) memcpy_new_neon_line_size_32_auto,0,AV_CPU_FLAG_NEON},
#endif
{ NULL, NULL, 0},
};
/*
 * Table of memset() candidates benchmarked by find_best_memset().
 *
 *   name         human readable description; NULL marks the unused slot 0
 *                and the end of the table
 *   function     the implementation to call
 *   cpu_require  AV_CPU_FLAG_* bits the implementation needs, 0 for none
 *   t            elapsed time measured by the benchmark
 *
 * Designated initializers are used so that each value lands in the intended
 * member regardless of the member order; with positional initializers the
 * CPU flags were stored in `t` and `cpu_require` stayed 0.
 */
static struct {
	char *name;
	void *(*function)(void *to, uint8_t c, size_t len);
	uint32_t cpu_require;
	double t;
} memset_method[] =
{
	{ .name = NULL },
	{ .name = "glibc memset()", .function = (void*) memset },
#if defined(HAVE_ASM_MMX) || defined(HAVE_ASM_MMX2) || defined(HAVE_ASM_SSE)
	{ .name = "MMX/MMX2/SSE optimized memset()", .function = (void*) fast_memset,
	  .cpu_require = AV_CPU_FLAG_MMX|AV_CPU_FLAG_SSE|AV_CPU_FLAG_MMX2 },
#endif
#ifdef HAVE_ARM_NEON
	{ .name = "memset_neon (C) Harm Hanemaaijer <fgenfb@yahoo.com>", .function = (void*) memset_neon,
	  .cpu_require = AV_CPU_FLAG_NEON },
#endif
#ifdef HAVE_ARM
	{ .name = "memset align 0 (C) Harm Hanemaaijer <fgenfb@yahoo.com>", .function = (void*) memset_new_align_0 },
	{ .name = "memset align 8 (C) Harm Hanemaaijer <fgenfb@yahoo.com>", .function = (void*) memset_new_align_8 },
	{ .name = "memset align 32 (C) Harm Hanemaaijer <fgenfb@yahoo.com>", .function = (void*) memset_new_align_32 },
#endif
	{ .name = NULL },
};
@@ -1407,10 +1406,10 @@ void memcpy_report()
int i;
fprintf(stdout,"SIMD benchmark results:\n");
for( i = 1; memset_method[i].name; i ++ ) {
fprintf(stdout,"\t%8ld : %s\n",(long) memset_method[i].time, memset_method[i].name );
fprintf(stdout,"\t%g : %s\n",memset_method[i].t, memset_method[i].name );
}
for( i = 1; memcpy_method[i].name; i ++ ) {
fprintf(stdout,"\t%8ld : %s\n",(long) memcpy_method[i].time, memcpy_method[i].name );
fprintf(stdout,"\t%g : %s\n",memcpy_method[i].t, memcpy_method[i].name );
}
}
@@ -1430,7 +1429,7 @@ char *get_memset_descr()
void find_best_memcpy()
{
uint64_t t;
double t;
char *buf1, *buf2;
int i, best = 0,k;
int bufsize = 720 * 576 * 3;
@@ -1445,6 +1444,8 @@ void find_best_memcpy()
int cpu_flags = av_get_cpu_flags();
veejay_msg(VEEJAY_MSG_DEBUG, "Finding best memcpy ..." );
memset(buf1,0, bufsize);
memset(buf2,0, bufsize);
@@ -1454,28 +1455,28 @@ void find_best_memcpy()
for( i = 1; memcpy_method[i].name; i ++ ) {
t = rdtsc(cpu_flags);
t = get_time();
if( memcpy_method[i].cpu_require && !(cpu_flags & memcpy_method[i].cpu_require ) ) {
memcpy_method[i].time = 0;
memcpy_method[i].t = 0.0;
continue;
}
for( k = 0; k < 128; k ++ ) {
memcpy_method[i].function( buf1,buf2, bufsize );
}
t = rdtsc(cpu_flags) - t;
memcpy_method[i].time = t;
t = get_time() - t;
memcpy_method[i].t = t;
}
for( i = 1; memcpy_method[i].name; i ++ ) {
if(best == 0 ) {
best = i;
t = memcpy_method[i].time;
t = memcpy_method[i].t;
continue;
}
if( memcpy_method[i].time < t && memcpy_method[i].time > 0 ) {
t = memcpy_method[i].time;
if( memcpy_method[i].t < t && memcpy_method[i].t > 0 ) {
t = memcpy_method[i].t;
best = i;
}
}
@@ -1494,53 +1495,55 @@ void find_best_memcpy()
void find_best_memset()
{
uint64_t t;
char *buf1, *buf2;
int i, best = 0,k;
double t;
char *buf1, *buf2;
int i, best = 0,k;
int bufsize = 720 * 576 * 3;
int cpu_flags = av_get_cpu_flags();
if (!(buf1 = (char*) malloc( bufsize * sizeof(char) )))
return;
if (!(buf1 = (char*) malloc( bufsize * sizeof(char) )))
return;
if (!(buf2 = (char*) malloc( bufsize * sizeof(char) ))) {
free( buf1 );
return;
}
veejay_msg(VEEJAY_MSG_DEBUG, "Finding best memset..." );
if (!(buf2 = (char*) malloc( bufsize * sizeof(char) ))) {
free( buf1 );
return;
}
memset( buf1, 0, bufsize * sizeof(char));
memset( buf2, 0, bufsize * sizeof(char));
for (i=1; memset_method[i].name; i++)
{
for (i=1; memset_method[i].name; i++)
{
if( memset_method[i].cpu_require && !(cpu_flags & memset_method[i].cpu_require ) ) {
memset_method[i].time= 0;
memset_method[i].t= 0;
continue;
}
}
t = rdtsc(cpu_flags);
for( k = 0; k < 128; k ++ ) {
memset_method[i].function( buf1 , 0 , bufsize );
}
t = rdtsc(cpu_flags) - t;
memset_method[i].time = t;
t = get_time();
for( k = 0; k < 128; k ++ ) {
memset_method[i].function( buf1 , 0 , bufsize );
}
t = get_time() - t;
memset_method[i].t = t;
if (best == 0 || t < memset_method[best].time)
best = i;
}
if (best == 0 || t < memset_method[best].t)
best = i;
}
if (best) {
veejay_memset = memset_method[best].function;
} else {
veejay_memset = memset_method[1].function;
}
if (best) {
veejay_memset = memset_method[best].function;
}
else {
veejay_memset = memset_method[1].function;
}
selected_best_memset = best;
free( buf1 );
free( buf2 );
free( buf1 );
free( buf2 );
}
static void vj_frame_copy_job( void *arg ) {
@@ -1721,122 +1724,118 @@ void vj_frame_clear1( uint8_t *input, unsigned int val, int size )
vj_frame_clear( in, strides, val );
}
static unsigned long benchmark_single_slow(long c, int n_tasks, uint8_t **source, uint8_t **dest, int *planes)
static double benchmark_single_slow(long c, int n_tasks, uint8_t **source, uint8_t **dest, int *planes)
{
uint64_t k;
uint64_t stats[c];
int k;
double stats[c];
uint64_t bytes = ( planes[0] + planes[1] + planes[2] + planes[3] );
for( k = 0; k < c; k ++ )
{
uint64_t t = rdtsc(0);
double t = get_time();
vj_frame_slow_single( source, source, dest, planes[0], planes[1]/2, 0.67f );
t = rdtsc(0) - t;
t = get_time() - t;
stats[k] = t;
}
uint64_t sum = 0;
double sum = 0.0;
for( k = 0; k < c ;k ++ )
sum += stats[k];
uint64_t best_time = (sum / c );
double best_time = (sum / c );
veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %2.2f ms",
(float)(bytes /1048576.0f), (best_time/1000.0f));
veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %g",(float)((bytes*c) /1048576.0f), best_time);
return best_time;
}
static unsigned long benchmark_threaded_slow(long c, int n_tasks, uint8_t **source, uint8_t **dest, int *planes)
static double benchmark_threaded_slow(long c, int n_tasks, uint8_t **source, uint8_t **dest, int *planes)
{
uint64_t k;
uint64_t stats[c];
int k;
double stats[c];
uint64_t bytes = ( planes[0] + planes[1] + planes[2] + planes[3] );
for( k = 0; k < c; k ++ )
{
uint64_t t = rdtsc(0);
uint64_t t = get_time();
vj_frame_slow_threaded( source, source, dest, planes[0], planes[1]/2, 0.67f );
t = rdtsc(0) - t;
t = get_time() - t;
stats[k] = t;
}
uint64_t sum = 0;
double sum = 0.0;
for( k = 0; k < c ;k ++ )
sum += stats[k];
uint64_t best_time = (sum / c );
double best_time = (sum / c );
veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %2.2f ms",
(float)(bytes /1048576.0f), (best_time/1000.0f));
veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %g",(float)((bytes*c) /1048576.0f), best_time);
return best_time;
}
static unsigned long benchmark_threaded_copy(long c, int n_tasks, uint8_t **dest, uint8_t **source, int *planes)
static double benchmark_threaded_copy(long c, int n_tasks, uint8_t **dest, uint8_t **source, int *planes)
{
uint64_t k;
uint64_t stats[c];
int k;
double stats[c];
uint64_t bytes = ( planes[0] + planes[1] + planes[2] + planes[3] );
for( k = 0; k < c; k ++ )
{
uint64_t t = rdtsc(0);
double t = get_time();
vj_frame_copyN( source,dest,planes );
t = rdtsc(0) - t;
t = get_time() - t;
stats[k] = t;
}
uint64_t sum = 0;
double sum = 0.0;
for( k = 0; k < c ;k ++ )
sum += stats[k];
uint64_t best_time = (sum / c );
double best_time = (sum / c );
veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %2.2f ms",
(float)(bytes /1048576.0f), (best_time/1000.0f));
veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %g",(float)((bytes*c) /1048576.0f), best_time);
return best_time;
}
static unsigned long benchmark_single_copy(long c,int dummy, uint8_t **dest, uint8_t **source, int *planes)
static double benchmark_single_copy(long c,int dummy, uint8_t **dest, uint8_t **source, int *planes)
{
uint64_t k; int j;
uint64_t stats[c];
int k; int j;
double stats[c];
uint64_t bytes = ( planes[0] + planes[1] + planes[2] + planes[3] );
for( k = 0; k < c; k ++ ) {
uint64_t t = rdtsc(0);
double t = get_time();
for( j = 0; j < 4; j ++ ) {
veejay_memcpy( dest[j], source[j], planes[j] );
}
t = rdtsc(0) - t;
t = get_time() - t;
stats[k] = t;
}
uint64_t sum = 0;
double sum = 0.0;
for( k = 0; k < c; k ++ )
sum += stats[k];
uint64_t best_time = (sum/c);
double best_time = (sum/c);
veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %2.2f ms",
(float)(bytes /1048576.0f), (best_time/1000.0f));
veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %g",(float)((bytes*c) /1048576.0f), best_time);
return best_time;
}
typedef unsigned long (*benchmark_func)(long c, int dummy, uint8_t **dest, uint8_t **source, int *planes);
typedef double (*benchmark_func)(long c, int dummy, uint8_t **dest, uint8_t **source, int *planes);
void run_benchmark_test(int n_tasks, benchmark_func f, char *str, int n_frames, uint8_t **dest, uint8_t **source, int *planes )
{
uint32_t N = 8;
uint64_t stats[N];
int N = 8;
double stats[N];
uint32_t i;
uint64_t fastest = 0;
double fastest = 0.0;
float work_size = (planes[0] + planes[1] + planes[2] + planes[3]) / 1048576.0f;
veejay_msg(VEEJAY_MSG_INFO, "run test '%s' (%dx) on chunks of %2.2f MB:", str, N, work_size );
@@ -1848,8 +1847,8 @@ void run_benchmark_test(int n_tasks, benchmark_func f, char *str, int n_frames,
fastest = stats[i];
}
uint64_t sum = 0;
uint64_t slowest=fastest;
double sum = 0.0;
double slowest=fastest;
for( i = 0; i < N; i ++ )
{
if( stats[i] < fastest ) {
@@ -1860,8 +1859,7 @@ void run_benchmark_test(int n_tasks, benchmark_func f, char *str, int n_frames,
float average = (sum / N);
veejay_msg(VEEJAY_MSG_INFO, "run done: best score for %s is %2.4f ms, worst is %2.4f ms, average is %2.4f ms",
str, fastest/1000.0f, slowest/1000.0f, average/1000.0f );
veejay_msg(VEEJAY_MSG_INFO, "run done: best score for %s is %g, worst is %g, average is %g",str, fastest, slowest, average );
}
void benchmark_tasks(int n_tasks, long n_frames, int w, int h)
@@ -1914,16 +1912,19 @@ void benchmark_veejay(int w, int h)
if( h < 64)
h = 64;
veejay_msg(VEEJAY_MSG_INFO, "Starting benchmark %dx%d YUVP 4:2:2 (100 frames)", w,h);
int n_tasks = task_num_cpus();
init_parallel_tasks( n_tasks );
char *str2 = getenv( "VEEJAY_MULTITHREAD_TASKS" );
if( str2 ) {
n_tasks = atoi(str2);
}
int n_frames = 100;
veejay_msg(VEEJAY_MSG_INFO, "Benchmark %dx%d YUVP 4:2:2 (%d frames)", w,h,n_frames);
benchmark_tasks( n_tasks, n_frames,w,h );
veejay_msg(VEEJAY_MSG_INFO, "VEEJAY_MULTITHREAD_TASKS=%d", n_tasks );
init_parallel_tasks( n_tasks );
benchmark_tasks( n_tasks,100,w,h );
}
void *vj_hmalloc(size_t sze, const char *name)

View File

@@ -183,10 +183,6 @@ int vj_mem_threaded_init(int w, int h)
num_tasks = n_cpus;
if( num_tasks < 1 )
num_tasks = 1;
if( num_tasks > 1 )
veejay_msg( VEEJAY_MSG_INFO, "Using %d threads scheduled over %d cpus in performer.", num_tasks, n_cpus-1 );
}
}

View File

@@ -3,3 +3,8 @@ SUBDIRS = aclib bio2jack libhash liblzo libOSC libresample
if !HAVE_MJPEGTOOLS
SUBDIRS += mjpegtools
endif
# Descend into the fastarm directory only when the HAVE_ARM automake
# conditional is true.
if HAVE_ARM
SUBDIRS += fastarm
endif

View File

@@ -0,0 +1,10 @@
# Makefile for veejay
#
# Builds new_arm.S into libfastarm.la, a libtool library that is not
# installed (noinst).
MAINTAINERCLEANFILES = Makefile.in
AM_CFLAGS = $(FASTARM_CFLAGS)
# Assembly sources are compiled with AM_CPPFLAGS and AM_CCASFLAGS rather than
# AM_CFLAGS, so the fastarm flags are handed over through AM_CCASFLAGS instead
# of being repeated inside the include path list.
AM_CCASFLAGS = $(FASTARM_CFLAGS)
AM_CPPFLAGS = -I$(top_srcdir) -I$(includedir) \
              -I$(top_srcdir)/thirdparty
FASTARM_LIB_FILE = libfastarm.la
noinst_LTLIBRARIES = $(FASTARM_LIB_FILE)
# The header is listed so that it is shipped by "make dist".
# NOTE(review): assumes new_arm.h lives in this directory, as the
# <fastarm/new_arm.h> include suggests - confirm.
libfastarm_la_SOURCES = new_arm.S new_arm.h
EXTRA_DIST=

View File

@@ -0,0 +1,97 @@
fastarm
This toolkit contains a set of fast memcpy/memset variants for ARM
platforms. They use either the standard register file or, optionally,
NEON instructions.
Several basic families of variants are provided. The current ones are
the "new memcpy" variants, which are the default for memcpy replacement.
They generally do not overfetch beyond the source region and can be
configured either to use unaligned memory access for small sizes or to use
strictly aligned memory access. This family can also be configured to
include a fast path for smaller sizes (this is the default); disabling
it results in smaller code size at the expense of worse performance
for small sizes. NEON optimized versions, which are generally faster
with reduced code size, are also provided.
To compile the benchmark program, run 'make'. This will compile in a
plethora of variants with different preload strategies, block sizes,
alignment etc.
A benchmark program to compare various memcpy variants is provided. Try
something like "./benchmark --memcpy ad --all". (Use --memcpy al on the
Raspberry Pi platform).
To compile a memcpy replacement library, set PLATFORM to one of the
values described at the beginning of the Makefile. This selects the
cache line size to use and whether to use NEON versions.
Optionally disable Thumb2 mode compilation by commenting out the THUMBFLAGS
definition. It must be disabled on the Raspberry Pi.
Then run:
sudo make install_memcpy_replacement
The replacement memcpy/memset shared library will be installed into
/usr/lib/arm-linux-gnueabihf/ as libfastarm.so.
To enable the use of the replacement memcpy in applications, create or edit
the file /etc/ld.so.preload so that it contains the line:
/usr/lib/arm-linux-gnueabihf/libfastarm.so
On the RPi platform, references to libcofi_rpi.so should be commented out
or deleted. The new memcpy should now be activated for newly launched
programs. To be sure, reboot or run:
sudo ldconfig
To revert to the default optimized memcpy on the RPi platform,
edit /etc/ld.so.preload so that it contains the line:
/usr/lib/arm-linux-gnueabihf/libcofi_rpi.so
instead of the one using libfastarm.so.
Note on cache line size:
Although assuming a preload line size of 64 bytes is a little faster on
several Cortex platforms for small to moderate sizes, assuming 32-byte
preloads seems to be faster when accessing DRAM with larger sizes. On
earlier Cortex A9 models, 32-byte preloads are required for good
performance in all cases.
Notes on performance with and without NEON:
For NEON-based memcpy, a significant benefit is seen on the tested Cortex A8
platform for unaligned copies in cache memory and for aligned and unaligned
copies in DRAM. Performance for aligned copies in cache memory is relatively
similar to the optimized non-NEON function.
Results in MB/s on a Cortex A8, with Thumb2 mode enabled, of
standard libc (Debian unstable), armv7 and NEON optimized memcpy
variants with line size of 32 bytes:
libc armv7 NEON
test 0 522 549 567
test 1 329 377 378
test 2 434 430 513
test 28 351 361 458
test 29 246 248 358
test 43 467 512 581
Test 0 in the benchmark program tests word-aligned requests with
sizes that are a power of 2 up to 4096 bytes distributed according
to a power law.
Test 1 in the benchmark program tests word-aligned requests with
sizes up to 1024 that are a multiple of 4, distributed according
to a power law.
Test 2 in the benchmark program tests unaligned requests with sizes
up to 1023 bytes.
Test 28 in the benchmark program tests word aligned requests in DRAM
with sizes up to 1024 bytes.
Test 29 in the benchmark program tests word aligned requests in DRAM
with sizes up to 256 bytes.
Test 43 in the benchmark program tests page aligned requests in DRAM
of size 4096 (copying a memory page).

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,35 @@
/*
 * Prototypes of the fastarm memcpy/memset variants.
 *
 * All memcpy variants share the signature of the standard memcpy() and all
 * memset variants the signature of the standard memset(). The function
 * names encode the tuning of each variant (line size, preload offset,
 * alignment handling, NEON).
 * NOTE(review): the implementations presumably live in new_arm.S - confirm.
 */
#ifndef FASTARM_NEW_ARM_H
#define FASTARM_NEW_ARM_H

#include <stddef.h>	/* size_t */

/* memcpy variants, line size 64 */
extern void *memcpy_new_line_size_64_preload_192(void *dest,
	const void *src, size_t n);
extern void *memcpy_new_line_size_64_preload_192_align_32(void *dest,
	const void *src, size_t n);
extern void *memcpy_new_line_size_64_preload_192_aligned_access(void *dest,
	const void *src, size_t n);

/* memcpy variants, line size 32 */
extern void *memcpy_new_line_size_32_preload_192(void *dest,
	const void *src, size_t n);
extern void *memcpy_new_line_size_32_preload_192_align_32(void *dest,
	const void *src, size_t n);
extern void *memcpy_new_line_size_32_preload_96(void *dest,
	const void *src, size_t n);
extern void *memcpy_new_line_size_32_preload_96_aligned_access(void *dest,
	const void *src, size_t n);

/* memcpy variants using NEON */
extern void *memcpy_new_neon_line_size_64(void *dest, const void *src, size_t n);
extern void *memcpy_new_neon_line_size_32(void *dest, const void *src, size_t n);
extern void *memcpy_new_neon_line_size_32_auto(void *dest, const void *src, size_t n);

/* memset variants */
extern void *memset_new_align_0(void *dest, int c, size_t size);
extern void *memset_new_align_8(void *dest, int c, size_t size);
extern void *memset_new_align_32(void *dest, int c, size_t size);
extern void *memset_neon(void *dest, int c, size_t size);

#endif /* FASTARM_NEW_ARM_H */

View File

@@ -66,6 +66,10 @@ if !HAVE_MJPEGTOOLS
libveejay_la_LIBADD+=-L$(top_builddir)/thirdparty/mjpegtools -lmjpegutils
endif
# When the HAVE_ARM conditional is true, link libveejay against the fastarm
# library from thirdparty/fastarm in the build tree.
if HAVE_ARM
libveejay_la_LIBADD+=-L$(top_builddir)/thirdparty/fastarm -lfastarm
endif
libveejay_la_LDFLAGS += $(SDL_LIBS) $(SDL_TTF_LIBS) $(DIRECTFB_LIBS) $(X_LIBS) $(PTHREAD_LIBS) $(FT_LDFLAGS) $(FT_LIBS) \
$(XML2_LIBS) $(JPEG_LIBS) $(LIBLO_LIBS) $(LIBUNWIND_LIBS) $(GLIB_LIBS) \
$(FFMPEG_LIBS) $(XINERAMA_LIBS) $(MJPEGTOOLS_LIBS) $(LIBPNG_LIBS) \

View File

@@ -788,7 +788,7 @@ int main(int argc, char **argv)
{
veejay_free(info);
return 0;
}
}
print_license();