From 1eb4ff18ebe5ff6989c69afef6bd8da384019be0 Mon Sep 17 00:00:00 2001 From: c0ntrol Date: Sat, 13 Feb 2016 18:53:36 +0100 Subject: [PATCH] add fastarm memcpy --- veejay-current/veejay-server/configure.ac | 50 +- .../veejay-server/libvjmem/memcpy.c | 267 +-- .../veejay-server/libvjmem/vj-x86.c | 4 - .../veejay-server/thirdparty/Makefile.am | 5 + .../thirdparty/fastarm/Makefile.am | 10 + .../veejay-server/thirdparty/fastarm/README | 97 + .../thirdparty/fastarm/new_arm.S | 1858 +++++++++++++++++ .../thirdparty/fastarm/new_arm.h | 35 + .../veejay-server/veejay/Makefile.am | 4 + veejay-current/veejay-server/veejay/veejay.c | 2 +- 10 files changed, 2172 insertions(+), 160 deletions(-) create mode 100644 veejay-current/veejay-server/thirdparty/fastarm/Makefile.am create mode 100644 veejay-current/veejay-server/thirdparty/fastarm/README create mode 100644 veejay-current/veejay-server/thirdparty/fastarm/new_arm.S create mode 100644 veejay-current/veejay-server/thirdparty/fastarm/new_arm.h diff --git a/veejay-current/veejay-server/configure.ac b/veejay-current/veejay-server/configure.ac index 99b59584..8fa62cf2 100644 --- a/veejay-current/veejay-server/configure.ac +++ b/veejay-current/veejay-server/configure.ac @@ -376,6 +376,27 @@ esac CFLAGS="$CFLAGS -D_FILE_OFFSET_BITS=64 -D_LARGE_FILES" +if test x$host_alias != x; then + dnl Cross compiling + AC_MSG_CHECKING(sub-architecture settings) + if test x$have_x86cpu = xtrue; then + host_mod_cpu=`echo $host_cpu|tr _ -` + ARCHFLAGS="-march=$host_mod_cpu -mcpu=$host_mod_cpu" + AC_MSG_RESULT($ARCHFLAGS) + fi +else + AC_MSG_CHECKING(sub-architecture settings) + + chmod +x $srcdir/cpuinfo.sh + + if test "$arch_target" = "auto"; then + TMP=`$srcdir/cpuinfo.sh` + ARCHFLAGS=`cat veejay.arch` + else + ARCHFLAGS="-mtune=generic" + fi + AC_MSG_RESULT($ARCHFLAGS) +fi dnl ARM architecture detect NEON and set CFLAGS if test x$have_arm = xtrue @@ -392,8 +413,11 @@ then if test $ac_cv_flag_neon = yes ; then AC_DEFINE(HAVE_ARM_NEON,1,[Compiling in NEON support]) USER_CFLAGS="-mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad $USER_CFLAGS" + FASTARM_CFLAGS="$ARCHFLAGS -Wa,-march=armv7-a -mthumb -Wa,-mthumb -Wa,-mimplicit-it=always -mthumb-interwork -DCONFIG_THUMB" + else USER_CFLAGS="-march=native -ftree-vectorize $USER_CFLAGS" + FASTARM_CFLAGS="$ARCHFLAGS -Wa, -mthumb -Wa,-mthumb -Wa,-mimplicit-it=always -mthumb-interwork -DCONFIG_THUMB" fi if test "x$enable_debug" != "xyes" ; then @@ -406,6 +430,8 @@ then SUBSAMPLE_CFLAGS="$USER_CFLAGS" VJE_CFLAGS="$USER_CFLAGS" CFLAGS="$USER_CFLAGS -D_FILE_OFFSET_BITS=64 -D_LARGE_FILES" + + AC_SUBST(FASTARM_CFLAGS) fi dnl This flag is used for PROGRAMS not SHARED LIBRARIES. 
PIC code is required @@ -584,28 +610,6 @@ EOF fi fi -if test x$host_alias != x; then - dnl Cross compiling - AC_MSG_CHECKING(sub-architecture settings) - if test x$have_x86cpu = xtrue; then - host_mod_cpu=`echo $host_cpu|tr _ -` - ARCHFLAGS="-march=$host_mod_cpu -mcpu=$host_mod_cpu" - AC_MSG_RESULT($ARCHFLAGS) - fi -else - AC_MSG_CHECKING(sub-architecture settings) - - chmod +x $srcdir/cpuinfo.sh - - if test "$arch_target" = "auto"; then - TMP=`$srcdir/cpuinfo.sh` - ARCHFLAGS=`cat veejay.arch` - else - ARCHFLAGS="-mtune=generic" - fi - AC_MSG_RESULT($ARCHFLAGS) -fi - have_mjpegtools=false AC_SUBST(MJPEGTOOLS_CFLAGS) AC_SUBST(MJPGETOOLS_LIBS) @@ -1074,6 +1078,7 @@ AM_CONDITIONAL(HAVE_JPEG,test x$have_jpeg = xtrue) AM_CONDITIONAL(HAVE_LIBLO,test x$have_liblo = xtrue) AM_CONDITIONAL(HAVE_FREETYPE2, test x$have_freetype2 = xtrue) AM_CONDITIONAL(HAVE_MJPEGTOOLS, test x$have_mjpegtools = xtrue ) +AM_CONDITIONAL(HAVE_ARM, test x$have_arm = xtrue ) dnl ********************************************************************* dnl Check for what warnings we want gcc to use and adjust the CFLAGS dnl as needed. This only works for GCC. @@ -1161,6 +1166,7 @@ fi AC_CONFIG_FILES([ thirdparty/Makefile +thirdparty/fastarm/Makefile thirdparty/aclib/Makefile thirdparty/bio2jack/Makefile thirdparty/libhash/Makefile diff --git a/veejay-current/veejay-server/libvjmem/memcpy.c b/veejay-current/veejay-server/libvjmem/memcpy.c index 184a6d18..5e4a9c13 100644 --- a/veejay-current/veejay-server/libvjmem/memcpy.c +++ b/veejay-current/veejay-server/libvjmem/memcpy.c @@ -141,6 +141,10 @@ #include #include #include +#ifdef HAVE_ARM +#include +#endif + #define BUFSIZE 1024 @@ -157,37 +161,12 @@ static int selected_best_memcpy = 1; static int selected_best_memset = 1; -#ifdef HAVE_POSIX_TIMERS -static int64_t _x_gettime(void) +static double get_time() { - struct timespec tm; - return (clock_gettime(CLOCK_THREAD_CPUTIME_ID,&tm) == -1 ) - ? 
times(NULL) - : (int64_t) tm.tv_sec * 1e9 + tm.tv_nsec; + struct timespec ts; + clock_gettime( CLOCK_REALTIME, &ts ); + return (double) ts.tv_sec + (double) ts.tv_nsec / 1000000000.0; } -#define rdtsc(x) _x_gettime() -#elif (defined(ARCH_X86) || defined(ARCH_X86_64)) && defined(HAVE_SYS_TIMES_H) -static int64_t rdtsc(int cpu_flags) -{ - int64_t x; - if( cpu_flags & AV_CPU_FLAGS_MMX ) { - __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x)); - return x; - } else { - return times(NULL); - } -} -#else -static uint64_t rdtsc(int cpu_flags) -{ -#ifdef HAVE_SYS_TIMES_H - struct tms tp; - return times(&tp); -#else - return clock(); -#endif -} -#endif /* HAVE_SYS_TIMES_H */ #if defined(ARCH_X86) || defined (ARCH_X86_64) /* for small memory blocks (<256 bytes) this version is faster */ @@ -251,7 +230,6 @@ void yuyv_plane_clear( size_t len, void *to ) if( vj_task_available() ) { uint8_t * t = (uint8_t*) to; uint8_t *in[4] = { t, NULL,NULL,NULL }; - int strides[4] = { len, 0,0,0 }; vj_task_run( in, in, NULL, NULL, 1, (performer_job_routine) &yuyv_plane_clear_job ); } else { @@ -1349,10 +1327,10 @@ static void *memcpy_neon( void *to, const void *from, size_t n ) static struct { - char *name; - void *(*function)(void *to, const void *from, size_t len); - uint64_t time; - uint32_t cpu_require; + char *name; + void *(*function)(void *to, const void *from, size_t len); + double t; + uint32_t cpu_require; } memcpy_method[] = { { NULL, NULL, 0}, @@ -1382,23 +1360,44 @@ static struct { #endif #ifdef HAVE_ARM_NEON { "NEON optimized memcpy()", (void*) memcpy_neon, 0, AV_CPU_FLAG_NEON }, +#endif +#ifdef HAVE_ARM + { "new mempcy for cortex with line size of 32, preload offset of 192 (C) Harm Hanemaaijer ", (void*) memcpy_new_line_size_32_preload_192,0,0 }, + { "new memcpy for cortex with line size of 64, preload offset of 192 (C) Harm Hanemaaijer " ,(void*) memcpy_new_line_size_64_preload_192, 0, 0 }, + { "new memcpy for cortex with line size of 64, preload offset of 192, aligned access (C) Harm Hanemaaijer ", (void*) memcpy_new_line_size_64_preload_192_aligned_access, 0, 0 }, + { "new memcpy for cortex with line size of 32, preload offset of 192, align 32", (void*) memcpy_new_line_size_32_preload_192_align_32,0,0}, + { "new memcpy for cortex with line size of 32, preload offset of 96", (void*) memcpy_new_line_size_32_preload_96,0,0}, + { "new memcpy for cortex with line size of 32, preload offset of 96, aligned access", (void*) memcpy_new_line_size_32_preload_96_aligned_access,0,0}, +#endif +#ifdef HAVE_ARM_NEON + { "new memcpy for cortex using NEON with line size of 32, preload offset of 192 (C) Harm Hanemaaijer ", (void*) memcpy_new_neon_line_size_32,0,AV_CPU_FLAG_NEON}, + { "new memcpy for cortex using NEON with line size of 64, preload offset of 192 (C) Harm Hanemaaijer ", (void*) memcpy_new_neon_line_size_64,0,AV_CPU_FLAG_NEON}, + { "new mempcy for cortex using NEON with line size of 32, automatic prefetcher (C) Harm Hanemaaijer ", (void*) memcpy_new_neon_line_size_32_auto,0,AV_CPU_FLAG_NEON}, #endif { NULL, NULL, 0}, }; static struct { - char *name; - void *(*function)(void *to, uint8_t c, size_t len); - uint64_t time; - uint32_t cpu_require; + char *name; + void *(*function)(void *to, uint8_t c, size_t len); + uint32_t cpu_require; + double t; } memset_method[] = { - { NULL, NULL, 0,0}, - { "glibc memset()", (void*)memset, 0,0}, + { NULL, NULL, 0,0}, + { "glibc memset()",(void*)memset,0,0}, #if defined(HAVE_ASM_MMX) || defined(HAVE_ASM_MMX2) || defined(HAVE_ASM_SSE) - { "MMX/MMX2/SSE optimized memset()", 
(void*) fast_memset, 0, AV_CPU_FLAG_MMX|AV_CPU_FLAG_SSE|AV_CPU_FLAG_MMX2}, + { "MMX/MMX2/SSE optimized memset()", (void*) fast_memset,0,AV_CPU_FLAG_MMX|AV_CPU_FLAG_SSE|AV_CPU_FLAG_MMX2 }, #endif - { NULL, NULL, 0,0}, +#ifdef HAVE_ARM_NEON + { "memset_neon (C) Harm Hanemaaijer ", (void*) memset_neon,0, AV_CPU_FLAG_NEON }, +#endif +#ifdef HAVE_ARM + { "memset align 0 (C) Harm Hanemaaijer ", (void*) memset_new_align_0,0,0 }, + { "memset align 8 (C) Harm Hanemaaijer ", (void*) memset_new_align_8,0,0 }, + { "memset align 32 (C) Harm Hanemaaijer ", (void*) memset_new_align_32,0,0 }, +#endif + { NULL, NULL, 0, 0}, }; @@ -1407,10 +1406,10 @@ void memcpy_report() int i; fprintf(stdout,"SIMD benchmark results:\n"); for( i = 1; memset_method[i].name; i ++ ) { - fprintf(stdout,"\t%8ld : %s\n",(long) memset_method[i].time, memset_method[i].name ); + fprintf(stdout,"\t%g : %s\n",memset_method[i].t, memset_method[i].name ); } for( i = 1; memcpy_method[i].name; i ++ ) { - fprintf(stdout,"\t%8ld : %s\n",(long) memcpy_method[i].time, memcpy_method[i].name ); + fprintf(stdout,"\t%g : %s\n",memcpy_method[i].t, memcpy_method[i].name ); } } @@ -1430,7 +1429,7 @@ char *get_memset_descr() void find_best_memcpy() { - uint64_t t; + double t; char *buf1, *buf2; int i, best = 0,k; int bufsize = 720 * 576 * 3; @@ -1445,6 +1444,8 @@ void find_best_memcpy() int cpu_flags = av_get_cpu_flags(); + veejay_msg(VEEJAY_MSG_DEBUG, "Finding best memcpy ..." ); + memset(buf1,0, bufsize); memset(buf2,0, bufsize); @@ -1454,28 +1455,28 @@ void find_best_memcpy() for( i = 1; memcpy_method[i].name; i ++ ) { - t = rdtsc(cpu_flags); + t = get_time(); if( memcpy_method[i].cpu_require && !(cpu_flags & memcpy_method[i].cpu_require ) ) { - memcpy_method[i].time = 0; + memcpy_method[i].t = 0.0; continue; } for( k = 0; k < 128; k ++ ) { memcpy_method[i].function( buf1,buf2, bufsize ); } - t = rdtsc(cpu_flags) - t; - memcpy_method[i].time = t; + t = get_time() - t; + memcpy_method[i].t = t; } for( i = 1; memcpy_method[i].name; i ++ ) { if(best == 0 ) { best = i; - t = memcpy_method[i].time; + t = memcpy_method[i].t; continue; } - if( memcpy_method[i].time < t && memcpy_method[i].time > 0 ) { - t = memcpy_method[i].time; + if( memcpy_method[i].t < t && memcpy_method[i].t > 0 ) { + t = memcpy_method[i].t; best = i; } } @@ -1494,53 +1495,55 @@ void find_best_memcpy() void find_best_memset() { - uint64_t t; - char *buf1, *buf2; - int i, best = 0,k; + double t; + char *buf1, *buf2; + int i, best = 0,k; int bufsize = 720 * 576 * 3; int cpu_flags = av_get_cpu_flags(); - if (!(buf1 = (char*) malloc( bufsize * sizeof(char) ))) - return; + if (!(buf1 = (char*) malloc( bufsize * sizeof(char) ))) + return; + + if (!(buf2 = (char*) malloc( bufsize * sizeof(char) ))) { + free( buf1 ); + return; + } + + veejay_msg(VEEJAY_MSG_DEBUG, "Finding best memset..." 
); - if (!(buf2 = (char*) malloc( bufsize * sizeof(char) ))) { - free( buf1 ); - return; - } - memset( buf1, 0, bufsize * sizeof(char)); memset( buf2, 0, bufsize * sizeof(char)); - for (i=1; memset_method[i].name; i++) - { + for (i=1; memset_method[i].name; i++) + { if( memset_method[i].cpu_require && !(cpu_flags & memset_method[i].cpu_require ) ) { - memset_method[i].time= 0; + memset_method[i].t= 0; continue; - } + } - t = rdtsc(cpu_flags); - for( k = 0; k < 128; k ++ ) { - memset_method[i].function( buf1 , 0 , bufsize ); - } - t = rdtsc(cpu_flags) - t; - - memset_method[i].time = t; + t = get_time(); + for( k = 0; k < 128; k ++ ) { + memset_method[i].function( buf1 , 0 , bufsize ); + } + t = get_time() - t; + + memset_method[i].t = t; - if (best == 0 || t < memset_method[best].time) - best = i; - } + if (best == 0 || t < memset_method[best].t) + best = i; + } - if (best) { - veejay_memset = memset_method[best].function; - } else { - veejay_memset = memset_method[1].function; - } + if (best) { + veejay_memset = memset_method[best].function; + } + else { + veejay_memset = memset_method[1].function; + } selected_best_memset = best; - free( buf1 ); - free( buf2 ); - + free( buf1 ); + free( buf2 ); } static void vj_frame_copy_job( void *arg ) { @@ -1721,122 +1724,118 @@ void vj_frame_clear1( uint8_t *input, unsigned int val, int size ) vj_frame_clear( in, strides, val ); } -static unsigned long benchmark_single_slow(long c, int n_tasks, uint8_t **source, uint8_t **dest, int *planes) +static double benchmark_single_slow(long c, int n_tasks, uint8_t **source, uint8_t **dest, int *planes) { - uint64_t k; - uint64_t stats[c]; + int k; + double stats[c]; uint64_t bytes = ( planes[0] + planes[1] + planes[2] + planes[3] ); for( k = 0; k < c; k ++ ) { - uint64_t t = rdtsc(0); + double t = get_time(); vj_frame_slow_single( source, source, dest, planes[0], planes[1]/2, 0.67f ); - t = rdtsc(0) - t; + t = get_time() - t; stats[k] = t; } - uint64_t sum = 0; + double sum = 0.0; for( k = 0; k < c ;k ++ ) sum += stats[k]; - uint64_t best_time = (sum / c ); + double best_time = (sum / c ); - veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %2.2f ms", - (float)(bytes /1048576.0f), (best_time/1000.0f)); + veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %g",(float)((bytes*c) /1048576.0f), best_time); return best_time; } -static unsigned long benchmark_threaded_slow(long c, int n_tasks, uint8_t **source, uint8_t **dest, int *planes) +static double benchmark_threaded_slow(long c, int n_tasks, uint8_t **source, uint8_t **dest, int *planes) { - uint64_t k; - uint64_t stats[c]; + int k; + double stats[c]; uint64_t bytes = ( planes[0] + planes[1] + planes[2] + planes[3] ); for( k = 0; k < c; k ++ ) { - uint64_t t = rdtsc(0); + uint64_t t = get_time(); vj_frame_slow_threaded( source, source, dest, planes[0], planes[1]/2, 0.67f ); - t = rdtsc(0) - t; + t = get_time() - t; stats[k] = t; } - uint64_t sum = 0; + double sum = 0.0; for( k = 0; k < c ;k ++ ) sum += stats[k]; - uint64_t best_time = (sum / c ); + double best_time = (sum / c ); - veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %2.2f ms", - (float)(bytes /1048576.0f), (best_time/1000.0f)); + veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %g",(float)((bytes*c) /1048576.0f), best_time); return best_time; } -static unsigned long benchmark_threaded_copy(long c, int n_tasks, uint8_t **dest, uint8_t **source, int *planes) +static double benchmark_threaded_copy(long c, int n_tasks, uint8_t **dest, uint8_t **source, int *planes) { - uint64_t k; - uint64_t stats[c]; + int k; + 
double stats[c]; uint64_t bytes = ( planes[0] + planes[1] + planes[2] + planes[3] ); for( k = 0; k < c; k ++ ) { - uint64_t t = rdtsc(0); + double t = get_time(); vj_frame_copyN( source,dest,planes ); - t = rdtsc(0) - t; + t = get_time() - t; stats[k] = t; } - uint64_t sum = 0; + double sum = 0.0; for( k = 0; k < c ;k ++ ) sum += stats[k]; - uint64_t best_time = (sum / c ); + double best_time = (sum / c ); - veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %2.2f ms", - (float)(bytes /1048576.0f), (best_time/1000.0f)); + veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %g",(float)((bytes*c) /1048576.0f), best_time); return best_time; } -static unsigned long benchmark_single_copy(long c,int dummy, uint8_t **dest, uint8_t **source, int *planes) +static double benchmark_single_copy(long c,int dummy, uint8_t **dest, uint8_t **source, int *planes) { - uint64_t k; int j; - uint64_t stats[c]; + int k; int j; + double stats[c]; uint64_t bytes = ( planes[0] + planes[1] + planes[2] + planes[3] ); for( k = 0; k < c; k ++ ) { - uint64_t t = rdtsc(0); + double t = get_time(); for( j = 0; j < 4; j ++ ) { veejay_memcpy( dest[j], source[j], planes[j] ); } - t = rdtsc(0) - t; + t = get_time() - t; stats[k] = t; } - uint64_t sum = 0; + double sum = 0.0; for( k = 0; k < c; k ++ ) sum += stats[k]; - uint64_t best_time = (sum/c); + double best_time = (sum/c); - veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %2.2f ms", - (float)(bytes /1048576.0f), (best_time/1000.0f)); + veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %g",(float)((bytes*c) /1048576.0f), best_time); return best_time; } -typedef unsigned long (*benchmark_func)(long c, int dummy, uint8_t **dest, uint8_t **source, int *planes); +typedef double (*benchmark_func)(long c, int dummy, uint8_t **dest, uint8_t **source, int *planes); void run_benchmark_test(int n_tasks, benchmark_func f, char *str, int n_frames, uint8_t **dest, uint8_t **source, int *planes ) { - uint32_t N = 8; - uint64_t stats[N]; + int N = 8; + double stats[N]; uint32_t i; - uint64_t fastest = 0; + double fastest = 0.0; float work_size = (planes[0] + planes[1] + planes[2] + planes[3]) / 1048576.0f; veejay_msg(VEEJAY_MSG_INFO, "run test '%s' (%dx) on chunks of %2.2f MB:", str, N, work_size ); @@ -1848,8 +1847,8 @@ void run_benchmark_test(int n_tasks, benchmark_func f, char *str, int n_frames, fastest = stats[i]; } - uint64_t sum = 0; - uint64_t slowest=fastest; + double sum = 0.0; + double slowest=fastest; for( i = 0; i < N; i ++ ) { if( stats[i] < fastest ) { @@ -1860,8 +1859,7 @@ void run_benchmark_test(int n_tasks, benchmark_func f, char *str, int n_frames, float average = (sum / N); - veejay_msg(VEEJAY_MSG_INFO, "run done: best score for %s is %2.4f ms, worst is %2.4f ms, average is %2.4f ms", - str, fastest/1000.0f, slowest/1000.0f, average/1000.0f ); + veejay_msg(VEEJAY_MSG_INFO, "run done: best score for %s is %g, worst is %g, average is %g",str, fastest, slowest, average ); } void benchmark_tasks(int n_tasks, long n_frames, int w, int h) @@ -1914,16 +1912,19 @@ void benchmark_veejay(int w, int h) if( h < 64) h = 64; + veejay_msg(VEEJAY_MSG_INFO, "Starting benchmark %dx%d YUVP 4:2:2 (100 frames)", w,h); + int n_tasks = task_num_cpus(); - init_parallel_tasks( n_tasks ); char *str2 = getenv( "VEEJAY_MULTITHREAD_TASKS" ); if( str2 ) { n_tasks = atoi(str2); } - - int n_frames = 100; - veejay_msg(VEEJAY_MSG_INFO, "Benchmark %dx%d YUVP 4:2:2 (%d frames)", w,h,n_frames); - benchmark_tasks( n_tasks, n_frames,w,h ); + + veejay_msg(VEEJAY_MSG_INFO, "VEEJAY_MULTITHREAD_TASKS=%d", n_tasks ); + + 
init_parallel_tasks( n_tasks ); + + benchmark_tasks( n_tasks,100,w,h ); } void *vj_hmalloc(size_t sze, const char *name) diff --git a/veejay-current/veejay-server/libvjmem/vj-x86.c b/veejay-current/veejay-server/libvjmem/vj-x86.c index d6b9970c..2bf92ed6 100644 --- a/veejay-current/veejay-server/libvjmem/vj-x86.c +++ b/veejay-current/veejay-server/libvjmem/vj-x86.c @@ -183,10 +183,6 @@ int vj_mem_threaded_init(int w, int h) num_tasks = n_cpus; if( num_tasks < 1 ) num_tasks = 1; - - if( num_tasks > 1 ) - veejay_msg( VEEJAY_MSG_INFO, "Using %d threads scheduled over %d cpus in performer.", num_tasks, n_cpus-1 ); - } } diff --git a/veejay-current/veejay-server/thirdparty/Makefile.am b/veejay-current/veejay-server/thirdparty/Makefile.am index 952e0cdb..71fb877a 100644 --- a/veejay-current/veejay-server/thirdparty/Makefile.am +++ b/veejay-current/veejay-server/thirdparty/Makefile.am @@ -3,3 +3,8 @@ SUBDIRS = aclib bio2jack libhash liblzo libOSC libresample if !HAVE_MJPEGTOOLS SUBDIRS += mjpegtools endif + +if HAVE_ARM +SUBDIRS += fastarm +endif + diff --git a/veejay-current/veejay-server/thirdparty/fastarm/Makefile.am b/veejay-current/veejay-server/thirdparty/fastarm/Makefile.am new file mode 100644 index 00000000..1381cee1 --- /dev/null +++ b/veejay-current/veejay-server/thirdparty/fastarm/Makefile.am @@ -0,0 +1,10 @@ +# Makefile for veejay +MAINTAINERCLEANFILES = Makefile.in +AM_CFLAGS = $(FASTARM_CFLAGS) +AM_CPPFLAGS = -I$(top_srcdir) -I$(includedir) \ + -I$(top_srcdir)/thirdparty $(FASTARM_CFLAGS) + +FASTARM_LIB_FILE = libfastarm.la +noinst_LTLIBRARIES = $(FASTARM_LIB_FILE) +libfastarm_la_SOURCES = new_arm.S +EXTRA_DIST= diff --git a/veejay-current/veejay-server/thirdparty/fastarm/README b/veejay-current/veejay-server/thirdparty/fastarm/README new file mode 100644 index 00000000..d48eba41 --- /dev/null +++ b/veejay-current/veejay-server/thirdparty/fastarm/README @@ -0,0 +1,97 @@ +fastarm + +This toolkit contains a set of fast memcpy/memset variants for ARM +platforms. They either use the standard register file, or optionally +NEON instructions, + +Several basic families of variants are provided; the current ones are +the "new memcpy" variants which are the default for memcpy replacement, +which generally do not overfetch beyond the source region and can be +configured to use unaligned memory access for small sizes, or to use +strictly aligned memory access. This family can also be configured to +include a fast path for smaller sizes (this is the default), disabling +this results in smaller code size at the expense of worse performance +for small sizes. NEON optimized versions, which are generally faster +with reduced code size, are also provided. + +To compile the benchmark program, run 'make'. This will compile in a +plethora of variants with different preload strategies, block sizes, +alignment etc. + +A benchmark program to compare various memcpy variants is provided. Try +something like "./benchmark --memcpy ad --all". (Use --memcpy al on the +Raspberry Pi platform). + +To compile a memcpy replacement library, set PLATFORM to one of the +values described at the beginning of the Makefile. This selects the +cache line size to use and whether to use NEON versions. + +Optionally disable Thumb2 mode compilation by commenting out the THUMBFLAGS +definition. It must be disabled on the Raspberry Pi. + +Then run: + + sudo make install_memcpy_replacement + +The replacement memcpy/memset shared library will be installed into +/usr/lib/arm-linux-gnueabihf/ as libfastarm.so. 
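+
+Before enabling the library system-wide, it can be tried out for a single
+process with LD_PRELOAD. This is only a quick sketch, assuming the install
+path above and a dynamically linked test program; the benchmark invocation
+is the same one suggested earlier in this README:
+
+    LD_PRELOAD=/usr/lib/arm-linux-gnueabihf/libfastarm.so ./benchmark --memcpy ad --all
+
+If the preloaded library is picked up, the benchmark results should match
+the variant selected via PLATFORM in the Makefile.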
+ +To enable the use of the replacement memcpy in applications, create or edit +the file /etc/ld.so.preload so that it contains the line: + + /usr/lib/arm-linux-gnueabihf/libfastarm.so + +On the RPi platform, references to libcofi_rpi.so should be commented out +or deleted. The new memcpy should now be activated for newly launched +programs. To be sure, reboot or run: + + sudo ldconfig + +To revert to the default optimized memcpy on the RPi platform, +edit /etc/ld.so.preload so that it contains the line: + + /usr/lib/arm-linux-gnueabihf/libcofi_rpi.so + +instead of the one using libfastarm.so. + +Note on cache line size: + +Although assuming a preload line size 64 bytes is a little faster on several +Cortex platforms for small to moderate sizes, when accessing DRAM +with larger sizes assuming 32 byte preloads seems to be faster. On earlier +Cortex A9 models, 32 byte preloads are required for good performance in all +cases. + +Notes on performance with and without NEON: + +For NEON-based memcpy, a significant benefit is seen on the tested Cortex A8 +platform for unaligned copies in cache memory and for aligned and unaligned +copies in DRAM. Performance for aligned copies in cache memory is relatively +similar to the optimized non-NEON function. + +Results in MB/s on a Cortex A8, with Thumb2 mode enabled, of +standard libc (Debian unstable), armv7 and NEON optimized memcpy +variants with line size of 32 bytes: + + libc armv7 NEON +test 0 522 549 567 +test 1 329 377 378 +test 2 434 430 513 +test 28 351 361 458 +test 29 246 248 358 +test 43 467 512 581 + +Test 0 in the benchmark program tests word-aligned requests with +sizes that are a power of 2 up to 4096 bytes distributed according +to a power law. +Test 1 in the benchmark program tests word-aligned requests with +sizes up to 1024 that are a multiple of 4, distributed according +to a power law. +Test 2 in the benchmark program tests unaligned requests with sizes +up to 1023 bytes. +Test 28 in the benchmark program tests word aligned requests in DRAM +with sizes up to 1024 bytes. +Test 29 in the benchmark program tests word aligned requests in DRAM +with sizes up to 256 bytes. +Test 43 in the benchmark program tests page aligned requests in DRAM +of size 4096 (copying a memory page). diff --git a/veejay-current/veejay-server/thirdparty/fastarm/new_arm.S b/veejay-current/veejay-server/thirdparty/fastarm/new_arm.S new file mode 100644 index 00000000..fd3a3a14 --- /dev/null +++ b/veejay-current/veejay-server/thirdparty/fastarm/new_arm.S @@ -0,0 +1,1858 @@ +/* + * Copyright 2013 Harm Hanemaaijer + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + */ + +#ifdef CONFIG_THUMB +#define W(instr) instr.w +#define THUMB(instr...) instr +#define ARM(instr...) +#else +#define W(instr) instr +#define THUMB(instr...) +#define ARM(instr...) instr +#endif + +/* + * In practice, because the way NEON is configured on most systems, + * specifying alignment hints for NEON instructions doesn't seem + * to improve performance, or even degrade performance in some cases. + * However, actually having the address aligned to an element + * boundary or greater is beneficial. + */ +#define NEON_ALIGN(n) +/* #define NEON_ALIGN(n) :n */ + +/* Prevent the stack from becoming executable */ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + +.text +.syntax unified +.arch armv7a +.fpu neon + +.macro asm_function function_name + .global \function_name +.func \function_name +.type \function_name, function +ARM( .p2align 5 ) +THUMB( .p2align 2 ) +\function_name: +.endm + +/* + * The following memcpy implementation is optimized with a fast path + * for common, word aligned cases and optionally use unaligned access for + * small sizes. + * + * - line_size is the cache line size used for prefetches. Must be 64 or 32. + * - prefetch_distance is the number of cache lines to look ahead and must be + * >= 2. + * - write_align is the write alignment enforced before the main loop for larger + * sizes (word aligned case) and must be 0, 16, 32, or 64. + * - aligned_access must be 0 or 1. When enabled, no unaligned memory accesses + * will occur. Both small size tresholds for unaligned access are not used + * in this case. + */ + +/* The threshold size for using the fast path for the word-aligned case. */ +#define FAST_PATH_THRESHOLD 256 +/* The threshold size for using the small size path for the word-aligned case. */ +#define SMALL_SIZE_THRESHOLD 15 +/* + * The threshold size for using the small size path for the unaligned case. + * Unaligned memory accesses will be generated for requests smaller or equal to + * this size. + */ +#define UNALIGNED_SMALL_SIZE_THRESHOLD 64 +/* + * The threshold size for using the small size path when both the source and + * the destination are unaligned. Unaligned memory accesses will be generated + * for requests smaller of equal to this size. + */ +#define BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD 32 + +/* + * For a code-reduced version, define all four of the above constants to 0, + * eliminating the fast path and small size special cases. With Thumb2 + * enabled, this resulted in a reduction in code size from 1150 to 824 bytes, + * at the cost of lower performance for smaller sizes. + */ +// #define FAST_PATH_THRESHOLD 0 +// #define SMALL_SIZE_THRESHOLD 0 +// #define UNALIGNED_SMALL_SIZE_THRESHOLD 0 +// #define BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD 0 + +/* + * EARLY_PREFETCHES is used in the fast path implementation. + * The optimal value for EARLY_PREFETCHES was determined empirically. + * It is equal to prefetch_distance + 1 for line_size 32. + * and prefetch_distance - 1 for line_size 64. + */ +#define EARLY_PREFETCHES (\prefetch_distance - (\line_size / 32) * 2 + 3) + +#if FAST_PATH_THRESHOLD > 0 +#define FAST_PATH(instr...) instr +#define NO_FAST_PATH(instr...) +#else +#define FAST_PATH(instr...) +#define NO_FAST_PATH(instr...) 
instr +#endif + + +/* Helper macro for the fast-path implementation. */ + +.macro copy_16_bytes bytes_to_go, line_size, prefetch_distance +#ifdef CONFIG_THUMB + /* + * When Thumb2 mode is enabled, the ldmia/stmia instructions + * will be 16-bit, and the preload instruction will be + * 32-bit, so we only need one 32-bit wide nop instruction + * when there's no preload, for a total size of two words. + */ +.if \bytes_to_go >= (EARLY_PREFETCHES * \line_size) && \ + (\bytes_to_go % \line_size) == 0 + pld [r1, ip] + ldmia r1!, {r3, r4, r5, r6} + stmia r0!, {r3, r4, r5, r6} +.else + ldmia r1!, {r3, r4, r5, r6} + W( nop ) + stmia r0!, {r3, r4, r5, r6} +.endif +#else + /* + * When ARM mode is enabled, every instruction is one word, + * so make sure the entire block is four instructions. + */ +.if \bytes_to_go >= (EARLY_PREFETCHES * \line_size) && \ +(\bytes_to_go % \line_size) == 0 + pld [r1, ip] +.else + nop +.endif + ldmia r1!, {r3, r4, r5, r6} + nop + stmia r0!, {r3, r4, r5, r6} +#endif +.endm + + +/* Helper macro implementing unaligned copy. */ + +.macro unaligned_copy shift, line_size, prefetch_distance, write_align, \ +aligned_access + /* + * ip is the aligned source base address. + * r3 is a word of data from the source. + */ +.if \write_align > 0 + cmp r2, #(32 + \write_align - 4) +.else + cmp r2, #32 +.endif + push {r5} + blt 55f + subs r2, r2, #32 + + /* Handle write alignment. */ +.if \write_align > 0 +.if \write_align == 8 + tst r0, #4 + mov r4, r3, lsr #\shift + ldrne r3, [r1], #4 + subne r2, r2, #4 + orrne r4, r4, r3, lsl #(32 - \shift) + strne r4, [r0], #4 +.else + ands r5, r0, #(\write_align - 1) + rsb r5, r5, #\write_align + beq 59f + sub r2, r2, r5 + +58: movs r4, r3, lsr #\shift + ldr r3, [r1], #4 + subs r5, r5, #4 + orr r4, r4, r3, lsl #(32 - \shift) + str r4, [r0], #4 + bgt 58b +59: +.endif +.endif + + /* + * Assume a preload at aligned base + line_size will + * be useful. + */ + pld [ip, #\line_size] + push {r6-r11} + mov r11, r3 + + mov r4, ip + add r5, r1, #(\prefetch_distance * \line_size) + subs r2, r2, #(\prefetch_distance * \line_size) + bic r3, r5, #31 + add r4, r4, #(2 * \line_size) + blt 54f + cmp r4, r3 + sub ip, r3, r1 + /* + * "Catch-up" the early preloads (which have been performed up + * to aligned source address + line_size) to the preload offset + * used in the main loop. + */ + bge 52f +51: adds r4, r4, #\line_size /* Thumb16 */ + cmp r4, r3 + pld [r4, #(- \line_size)] + blt 51b +52: + /* + * Note that when L1_CACHE_BYTES is 64, we are + * prefetching every 32 bytes. Although not optimal + * there doesn't seem to be big penalty for the extra + * preload instructions and it prevents greater + * code size and complexity. + */ +53: pld [r1, ip] +54: + ldmia r1!, {r4-r7} + mov r3, r11, lsr #\shift + ldmia r1!, {r8-r11} + orr r3, r3, r4, lsl #(32 - \shift) + movs r4, r4, lsr #\shift /* Thumb16 */ + orr r4, r4, r5, lsl #(32 - \shift) + movs r5, r5, lsr #\shift /* Thumb16 */ + orr r5, r5, r6, lsl #(32 - \shift) + movs r6, r6, lsr #\shift /* Thumb16 */ + orr r6, r6, r7, lsl #(32 - \shift) + movs r7, r7, lsr #\shift /* Thumb16 */ + orr r7, r7, r8, lsl #(32 - \shift) + mov r8, r8, lsr #\shift + orr r8, r8, r9, lsl #(32 - \shift) + mov r9, r9, lsr #\shift + orr r9, r9, r10, lsl #(32 - \shift) + mov r10, r10, lsr #\shift + orr r10, r10, r11, lsl #(32 - \shift) + subs r2, r2, #32 + stmia r0!, {r3-r10} + bge 53b + cmn r2, #(\prefetch_distance * \line_size) + bge 54b + /* Correct the count. 
*/ + adds r2, r2, #(\prefetch_distance * \line_size + 32) + + mov r3, r11 + pop {r6-r11} + +55: bics r5, r2, #3 + beq 57f + +56: movs r4, r3, lsr #\shift + ldr r3, [r1], #4 + subs r5, r5, #4 + orr r4, r4, r3, lsl #(32 - \shift) + str r4, [r0], #4 + bgt 56b + +57: pop {r5} + pop {r4} + subs r1, r1, #((32 - \shift) / 8) +.if \aligned_access == 1 + b 7b +.else + b 3b +.endif +.endm + + +/* The main memcpy function macro. */ + +.macro memcpy_variant line_size, prefetch_distance, write_align, \ +aligned_access + +.if \aligned_access == 1 + cmp r2, #3 +.else +NO_FAST_PATH( cmp r2, #3 ) +.endif + orr r3, r0, r1 +.if \aligned_access == 1 + push {r0} + ble 7f +.else +NO_FAST_PATH( push {r0} ) +NO_FAST_PATH( ble 3f ) +.endif + bic ip, r1, #(\line_size - 1) + tst r3, #3 + pld [ip] +.if \aligned_access == 1 +FAST_PATH( bne 30f ) +.else +FAST_PATH( push {r0} ) +FAST_PATH( bne 7f ) /* Unaligned source or destination. */ +.endif +FAST_PATH( cmp r2, #FAST_PATH_THRESHOLD ) +FAST_PATH( bgt 10f ) +NO_FAST_PATH( bne 30f ) +#if FAST_PATH_THRESHOLD == 0 + /* + * When the fast path is disabled, check whether there are + * enough bytes for alignment, and jump to the main handling + * code for larger sizes. + */ +.if \write_align > 0 + cmp r2, #(\write_align - 4) + bge 10f +.endif + push {r4} + b 18f +#endif + + /* + * Fast path for aligned copies of size <= FAST_PATH_THRESHOLD. + */ +#if FAST_PATH_THRESHOLD > 0 +#if SMALL_SIZE_THRESHOLD == 15 + bics r3, r2, #15 + pld [ip, #\line_size] + /* Jump for small sizes <= 15 bytes. */ + beq 5f +#else + cmp r2, #SMALL_SIZE_THRESHOLD + pld [ip, #\line_size] + /* Jump for small sizes <= SMALL_SIZE_THRESHOLD bytes. */ + ble 5f + bic r3, r2, #15 +#endif + +9: /* + * This is the entry-point into the fast path from + * an unaligned request that has been aligned. + */ + push {r4, r5, r6} + + /* + * Use a heuristic to determine whether the preload + * at aligned_base + 2 * line_size will be useful. + */ +.if EARLY_PREFETCHES >= 3 + cmp r2, #(2 * \line_size - \line_size / 2) +.endif + add r5, ip, #(EARLY_PREFETCHES * \line_size) +.if EARLY_PREFETCHES >= 3 + blt 1f +.endif +.if EARLY_PREFETCHES == 3 + pld [ip, #(2 * \line_size)] ) +.endif +.if EARLY_PREFETCHES == 4 + cmp r2, #(3 * \line_size - \line_size / 2) + pld [ip, #(2 * \line_size)] + blt 1f + pld [ip, #(3 * \line_size)] +.endif +.if EARLY_PREFETCHES == 5 + cmp r2, #(3 * \line_size - \line_size / 2) + pld [ip, #(2 * \line_size)] + blt 1f + cmp r2, #(4 * \line_size - \line_size / 2) + pld [ip, #(3 * \line_size)] + blt 1f + pld [ip, #(4 * \line_size)] +.endif + +1: /* + * Set r5 so that the next preload will occur + * exactly at aligned_base + EARLY_PREFETCHES * + * line_size. For example, if line_size is 64 + * and the number of bytes is 240, the next preload + * will occur after processing 48 bytes, which is derived + * from the formula r3 & (line_size - 1), + * where r3 is equal to number_of_bytes & (~15). + */ + rsb r4, r3, #256 + subs r5, r5, r1 + and ip, r3, #(\line_size - 1) + subs r2, r2, r3 /* Thumb16 */ +THUMB( lsrs r4, r4, #1 ) /* Thumb16 */ + sub ip, r5, ip + add pc, pc, r4 + nop + /* >= 256 bytes to go. */ + copy_16_bytes 256, \line_size, \prefetch_distance + /* >= 240 bytes go. */ + copy_16_bytes 240, \line_size, \prefetch_distance + /* >= 224 bytes to go. */ + copy_16_bytes 224, \line_size, \prefetch_distance + /* >= 204 bytes go. */ + copy_16_bytes 204, \line_size, \prefetch_distance + /* >= 192 bytes to go. */ + copy_16_bytes 192, \line_size, \prefetch_distance + /* >= 176 bytes go. 
*/ + copy_16_bytes 176, \line_size, \prefetch_distance + /* >= 160 bytes to go. */ + copy_16_bytes 160, \line_size, \prefetch_distance + /* >= 144 bytes go. */ + copy_16_bytes 144, \line_size, \prefetch_distance + /* >= 128 bytes to go. */ + copy_16_bytes 128, \line_size, \prefetch_distance + /* >= 112 bytes go. */ + copy_16_bytes 112, \line_size, \prefetch_distance + /* >= 96 bytes to go. */ + copy_16_bytes 96, \line_size, \prefetch_distance + /* >= 80 bytes to go. */ + copy_16_bytes 80, \line_size, \prefetch_distance + /* >= 64 bytes to go. */ + copy_16_bytes 64, \line_size, \prefetch_distance + /* >= 48 bytes to go. */ + copy_16_bytes 48, \line_size, \prefetch_distance + /* >= 32 bytes to go. */ + copy_16_bytes 32, \line_size, \prefetch_distance + /* At this point there are 16 to 31 bytes to go. */ + tst r2, #15 + ldmia r1!, {r3, r4, r5, r6} + cmpne r2, #8 + /* + * If r2 == 8, we need to clear the eq flag while + * making sure carry remains set. + */ + tsteq r2, #15 + stmia r0!, {r3, r4, r5, r6} + /* + * The equal flag is set if there are no bytes left. + * The carry flag is set is there are >= 8 bytes left. + */ + pop {r4, r5, r6} + beq 4f + +2: + /* + * ARM mode imposes restrictions on the registers used + * in double-word loads and stored so we have to use + * single-word operations. + */ +.if \aligned_access == 0 + ARM( ldrcs r3, [r1], #4 ) + ARM( ldrcs ip, [r1], #4 ) + ARM( strcs r3, [r0], #4 ) + ARM( strcs ip, [r0], #4 ) + THUMB( ldrdcs r3, ip, [r1], #8 ) + THUMB( strdcs r3, ip, [r0], #8 ) +.else + ldrcs r3, [r1], #4 + ldrcs ip, [r1], #4 + strcs r3, [r0], #4 + strcs ip, [r0], #4 +.endif + tst r2, #4 + ldrne ip, [r1], #4 + strne ip, [r0], #4 + tst r2, #3 + popeq {r0} + bxeq lr + + /* + * Handle the last up to three bytes. Unaligned access + * make take place if source or destination is not + * half-word aligned. + */ +3: movs r2, r2, lsl #31 + ldrhcs r3, [r1], #2 + strhcs r3, [r0], #2 + ldrbne r3, [r1], #1 + strbne r3, [r0], #1 +4: pop {r0} + bx lr + +5: /* + * Sizes <= SMALL_SIZE_THRESHOLD bytes, both source and + * destination aligned. + */ +#if SMALL_SIZE_THRESHOLD <= 15 + cmp r2, #8 /* cs if r2 >= 8. */ + b 2b +#else +101: tst r2, #4 + ldrne r3, [r1], #4 + subne r2, r2, #4 + strne r3, [r0], #4 + cmp r2, #8 + blt 3b +6: cmp r2, #16 + ldr r3, [r1], #4 + ldr ip, [r1], #4 + str r3, [r0], #4 + sub r2, r2, #8 + str ip, [r0], #4 + bge 6b + cmp r2, #0 + popeq {r0} + bxeq lr + b 3b +#endif + +#endif /* FAST_PATH_THRESHOLD > 0 */ + +.if \aligned_access == 1 + /* + * Handle the last up to three bytes avoiding + * unaligned memory access. + */ +7: movs r2, r2, lsl #31 + ldrbcs r3, [r1], #1 + ldrbcs ip, [r1], #1 + strbcs r3, [r0], #1 + strbcs ip, [r0], #1 + ldrbne r3, [r1], #1 + strbne r3, [r0], #1 + pop {r0} + bx lr +.endif + +#if FAST_PATH_THRESHOLD > 0 +.if \aligned_access == 0 +7: /* + * Unaligned source or destination. There are seperate small + * size thresholds for when both source and destination are + * unaligned and the other case. + */ + tst r0, #3 + mov r3, #UNALIGNED_SMALL_SIZE_THRESHOLD + tstne r1, #3 + movne r3, #BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD + cmp r2, r3 + bgt 30f + + /* Small sizes, unaligned case. Use single word load/stores. */ +#if SMALL_SIZE_THRESHOLD >= 16 + /* Use the identical code path already defined above. 
*/ + b 101b +#else + tst r2, #4 + ldrne r3, [r1], #4 + subne r2, r2, #4 + strne r3, [r0], #4 + cmp r2, #8 + blt 3b +8: cmp r2, #16 + ldr r3, [r1], #4 + ldr ip, [r1], #4 + str r3, [r0], #4 + sub r2, r2, #8 + str ip, [r0], #4 + bge 8b + b 3b +#endif +.endif +#endif /* FAST_PATH_THRESHOLD > 0 */ + +10: /* + * This is the start of the handling of larger sizes for + * aligned copies. + * + * Size > FAST_PATH_THRESHOLD (256). + * ip is the line_sized aligned source address for preloads. + */ + +.if \write_align >= 16 + ands r3, r0, #(\write_align - 1) + push {r4} + rsb r3, r3, #\write_align + beq 17f + push {lr} + bl 20f + pop {lr} +17: +.elseif \write_align == 8 + /* + * For write alignment of 8, it is quickest to do a simple + * conditional load/store. + */ + tst r0, #4 + push {r4} + ldrne r3, [r1], #4 + subne r2, r2, #4 + strne r3, [r0], #4 +.else + push {r4} +.endif + +18: +.if (FAST_PATH_THRESHOLD - (\write_align - 4)) < \line_size + cmp r2, #\line_size + blt 15f +.endif + subs r2, r2, #\line_size + +16: /* + * This is the entry-point when source and destination were + * initially unaligned but are now aligned because they had + * the same alignment within a word. Write alignment and + * size check has already been handled. + */ + + push {r5-r11} + + /* + * Assume a preload at aligned base + line_size will + * be useful. + */ + mov r4, ip + pld [ip, #\line_size] + add r5, r1, #(\prefetch_distance * \line_size) + subs r2, r2, #(\prefetch_distance * \line_size) + bic r3, r5, #(\line_size - 1) + add r4, r4, #(2 * \line_size) + blt 14f + cmp r4, r3 + sub ip, r3, r1 + /* + * "Catch-up" the early preloads (which have been performed up + * to aligned source address + line_size) to the preload offset + * used in the main loop. + */ + bge 12f +11: adds r4, r4, #\line_size /* Thumb16 */ + cmp r4, r3 + pld [r4, #(- \line_size)] + blt 11b +12: + + /* + * The main loop for large sizes. Copy 32 bytes at a time + * using ldmia/stmia while prefetching a 32-byte aligned + * address for line size 32, or 64 bytes at a time while + * prefetching a 64-byte aligned address for line size 64. + */ +13: pld [r1, ip] +14: +.if \line_size == 32 + ldmia r1!, {r4-r7} + subs r2, r2, #32 + ldmia r1!, {r8-r11} + stmia r0!, {r4-r7} + stmia r0!, {r8-r11} +.else + ldmia r1!, {r4-r11} + subs r2, r2, #64 + stmia r0!, {r4-r11} + ldmia r1!, {r4-r11} + stmia r0!, {r4-r11} +.endif + bge 13b + cmn r2, #(\prefetch_distance * \line_size) + bge 14b + /* Correct the count. */ + adds r2, r2, #((\prefetch_distance + 1) * \line_size) + pop {r5-r11} + +15: ands r3, r2, #60 +.if \write_align <= 8 + /* + * When the subroutine is not used for write alignment, the + * subroutine will only be called once, so branch without + * linking. + */ + bne 20f +19: +.else + mov ip, lr + blne 20f + mov lr, ip +.endif + pop {r4} +#if FAST_PATH_THRESHOLD > 0 + cmp r2, #0 + bne 3b +#else + ARM( cmp r2, #0 ) + ARM( beq 4f ) + THUMB( cbz r2, 4f ) + /* Handle the last up to three bytes. */ +3: movs r2, r2, lsl #31 + ldrhcs r3, [r1], #2 + strhcs r3, [r0], #2 + ldrbne r3, [r1], #1 + strbne r3, [r0], #1 +4: +#endif + pop {r0} + bx lr + + /* + * Subroutine that copies a multiple of 4 bytes of size + * r3 from 0 to 64 or 32 bytes. r2 is decremented by the + * number of bytes copied. + */ +20: tst r3, #4 + sub r2, r2, r3 + ldrne r4, [r1], #4 + subne r3, r3, #4 + strne r4, [r0], #4 +.if \write_align <= 32 && \line_size == 32 + rsb r3, r3, #32 +.else + rsb r3, r3, #64 +.endif + /* + * These ldmia/stmia instructions are 16-bit on Thumb2, + * 32-bit on ARM. 
+ */ + THUMB( lsrs r3, r3, #1 ) + add pc, pc, r3 + nop + ldmia r1!, {r3, r4} + stmia r0!, {r3, r4} + ldmia r1!, {r3, r4} + stmia r0!, {r3, r4} + ldmia r1!, {r3, r4} + stmia r0!, {r3, r4} + ldmia r1!, {r3, r4} + stmia r0!, {r3, r4} +.if \write_align > 32 || \line_size > 32 + ldmia r1!, {r3, r4} + stmia r0!, {r3, r4} + ldmia r1!, {r3, r4} + stmia r0!, {r3, r4} + ldmia r1!, {r3, r4} + stmia r0!, {r3, r4} + ldmia r1!, {r3, r4} + stmia r0!, {r3, r4} +.endif +.if \write_align <= 8 + b 19b +.else + mov pc, lr +.endif + +30: /* + * Unaligned case. Align the destination. + * Number of bytes is > UNALIGNED_SMALL_SIZE_THRESHOLD. + * Note: This may use unaligned access. + * ip is the line_size aligned source address for preloads. + */ + ands r3, r0, #3 + push {r4} + andeq r3, r1, #3 + beq 40f /* Destination is aligned but source is not. */ + /* Align the destination. */ + cmp r3, #2 +.if \aligned_access == 1 + ldrble r4, [r1], #1 + ldrble r3, [r1], #1 + suble r2, r2, #2 + strble r4, [r0], #1 + strble r3, [r0], #1 +.else + ldrhle r4, [r1], #2 + suble r2, r2, #2 + strhle r4, [r0], #2 +.endif + ldrbne r4, [r1], #1 + subne r2, r2, #1 + strbne r4, [r0], #1 + ands r3, r1, #3 + bne 40f /* Destination is aligned but source is not. */ + +#if 0 && FAST_PATH_THRESHOLD > 0 + /* + * Source and destination are now aligned. + * Now recreate the situation of a word-aligned memcpy + * with the current source and destination, + * which may require an extra preload instruction. + * + * This path is currently disabled disabled in favour + * of the one below this which does write alignment and + * jumps into the main loop for larger sizes. + */ + bic r3, r1, #(\line_size - 1) + pop {r4} + cmp r3, ip + THUMB( pldne [r3] ) + THUMB( cmp r2, #FAST_PATH_THRESHOLD ) + THUMB( mov ip, r3 ) + ARM( beq 31f ) + ARM( pld [r3] ) + ARM( mov ip, r3 ) +31: ARM( cmp r2, #FAST_PATH_THRESHOLD ) + bgt 10b + + /* + * Recreate the fast path small size check here, + * but only if it necessary. + */ +.if (BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD + 1 - 3) <= SMALL_SIZE_THRESHOLD || +\aligned_access == 1 + cmp r2, #SMALL_SIZE_THRESHOLD + pld [ip, #\line_size] + /* Jump for small sizes <= SMALL_SIZE_THRESHOLD bytes. */ + ble 5b +.else + pld [ip, #\line_size] +.endif + bic r3, r2, #15 + b 9b + +#else + /* + * Source and destination are now aligned. Check carefully + * whether there are enough bytes to do alignment. + */ +.if \write_align > 0 +.if (BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD + 1 - 3) < (\write_align - 4) \ +|| \aligned_access == 1 + cmp r2, #(\write_align - 4) + blt 31f +.endif +.if \write_align == 8 + /* + * For write alignment of 8, it is quickest to do a simple + * conditional load/store. + */ + tst r0, #4 + ldrne r3, [r1], #4 + subne r2, r2, #4 + strne r3, [r0], #4 +.else + ands r3, r0, #(\write_align - 1) + rsb r3, r3, #\write_align + beq 31f + push {lr} + bl 20b + pop {lr} +.endif + +31: /* + * Check whether there are enough bytes to do one iteration + * of the main loop. + */ +.if (BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD + 1 - 3 - (\write_align - 4)) < \line_size \ +|| \aligned_access == 1 + cmp r2, #\line_size + blt 15b +.endif + subs r2, r2, #\line_size +.else + /* + * No write alignment. Only have to check for enough bytes to + * do one iteration of the main loop. + */ + +.if (BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD + 1 - 3) < \line_size \ +|| \aligned_access == 1 + cmp r2, #\line_size + blt 15b +.endif + subs r2, r2, #\line_size +.endif + b 16b +#endif + +40: /* + * Unaligned case. Size is > SMALL_SIZE_THRESHOLD - 3. 
+ */ + bic r1, r1, #3 + cmp r3, #2 + ldr r3, [r1], #4 + beq 41f + bgt 42f + + unaligned_copy 8, \line_size, \prefetch_distance, \ + \write_align, \aligned_access + +41: unaligned_copy 16, \line_size, \prefetch_distance, \ + \write_align, \aligned_access + +42: unaligned_copy 24, \line_size, \prefetch_distance, \ + \write_align, \aligned_access + +.endm + +/* + * The following is a NEON-based memcpy implementation that may use unaligned + * access, but NEON instruction addresses are always at least element aligned. + * It is optimized for both Thumb2 (CONFIG_THUMB) and ARM mode. + * + * - line_size is the cache line size used for prefetches. Must be 64 or 32. + * - prefetch_distance is the number of cache lines to look ahead and must be + * >= 2, or 0 to disable prefetching in the main copying loop. + * - early_prefetch indicates whether to perform early preloads. Must be 0 or 1. + * When prefetch_distance > 0, early_prefetch should be 1. To remove all PLD + * instructions altogether, set both prefetch_distance and early_prefetch + * to 0. + */ + +.macro neon_memcpy_variant line_size, prefetch_distance, early_prefetch + + cmp r2, #3 +.if \prefetch_distance > 0 || \early_prefetch == 1 + push {r0} +.else + mov ip, r0 +.endif + orr r3, r0, r1 + ble 8f +.if \prefetch_distance > 0 || \early_prefetch == 1 + bic ip, r1, #(\line_size - 1) +.endif + tst r3, #3 +.if \early_prefetch == 1 + pld [ip] +.endif + bne 10f /* Unaligned source or destination. */ + push {r4} + + /* Aligned source and destination. */ +1: cmp r2, #256 + /* + * Jump to word-aligned NEON fast path <= 256 bytes. + */ + ble 18f + subs r2, r2, #\line_size + + /* Align to a 32-byte boundary. */ +#ifdef CONFIG_THUMB + /* + * Use conditional NEON instructions when + * available (Thumb2 mode) + */ + ands r4, r0, #31 + rsb r4, r4, #32 + beq 31f + tst r4, #4 + sub r2, r2, r4 + ldrne r3, [r1 :32], #4 + strne r3, [r0 :32], #4 + tst r4, #8 + vld1ne.32 {d0}, [r1]! + vst1ne.64 {d0}, [r0 NEON_ALIGN(64)]! + cmp r4, #16 + vld1ge.32 {d2, d3}, [r1]! + vst1ge.64 {d2, d3}, [r0 NEON_ALIGN(128)]! +#else + /* + * Otherwise, branch into a series of single + * loads/stores. + */ + ands r4, r0, #31 + beq 31f + rsb r3, r4, #32 + lsl r4, r4, #1 + sub r2, r2, r3 + add pc, pc, r4 + nop + ldr r3, [r1], #4 + str r3, [r0], #4 + ldr r4, [r1], #4 + str r4, [r0], #4 + ldr r3, [r1], #4 + str r3, [r0], #4 + ldr r4, [r1], #4 + str r4, [r0], #4 + ldr r3, [r1], #4 + str r3, [r0], #4 + ldr r4, [r1], #4 + str r4, [r0], #4 + ldr r3, [r1], #4 + str r3, [r0], #4 + ldr r4, [r1], #4 + str r4, [r0], #4 +#endif + cmp r2, #0 + addlt r2, r2, \line_size + blt 6f + +31: +.if \early_prefetch == 1 + pld [ip, #\line_size] +.endif +.if \prefetch_distance > 0 + /* + * Assume a preload at aligned base + line_size will + * be useful. + */ + push {r5} + mov r4, ip + add r5, r1, #(\prefetch_distance * \line_size) + subs r2, r2, #(\prefetch_distance * \line_size) + bic r3, r5, #(\line_size - 1) + add r4, r4, #(2 * \line_size) + blt 5f + cmp r4, r3 + sub ip, r3, r1 + /* + * "Catch-up" the early preloads (which have been performed up + * to aligned source address + line_size) to the preload offset + * used in the main loop. + */ + bge 3f +2: adds r4, r4, #\line_size /* Thumb16 */ + cmp r4, r3 + pld [r4, #(- \line_size)] + blt 2b +3: +.endif + + sub ip, ip, #\line_size +4: + /* + * Since the destination is 32-byte aligned, + * specify 256-bit alignment for the NEON stores. + */ +.if \line_size == 32 + vld1.32 {d0-d3}, [r1]! 
+ subs r2, r2, #32 +.if \prefetch_distance > 0 + pld [r1, ip] +.endif + vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]! +.else /* line_size == 64 */ + vld1.32 {d0-d3}, [r1]! + vld1.32 {d4-d7}, [r1]! +.if \prefetch_distance > 0 + pld [r1, ip] +.endif + vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]! + subs r2, r2, #64 + vst1.64 {d4-d7}, [r0 NEON_ALIGN(256)]! +.endif + bge 4b +.if \prefetch_distance > 0 +5: +.if \line_size == 32 + vld1.32 {d0-d3}, [r1]! + subs r2, r2, #32 + vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]! +.else /* line_size == 64 */ + vld1.32 {d0-d3}, [r1]! + vld1.32 {d4-d7}, [r1]! + vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]! + subs r2, r2, #64 + vst1.64 {d4-d7}, [r0 NEON_ALIGN(256)]! +.endif + cmn r2, #(\prefetch_distance * \line_size) + bge 5b +.endif + /* Correct the count. */ +23: adds r2, r2, #((\prefetch_distance + 1) * \line_size) +.if \prefetch_distance > 0 + pop {r5} +.endif + + /* + * Process the last 0-(line_size - 1) bytes, destination + * 32-byte aligned, source word aligned. + */ +6: +#ifdef CONFIG_THUMB + /* + * Use conditional NEON instructions when + * available (Thumb2 mode). + */ +.if \line_size == 64 + cmp r2, #32 + vld1ge.32 {d0-d3}, [r1]! + vst1ge.64 {d0-d3}, [r0 NEON_ALIGN(128)]! + tst r2, #16 + vld1ne.32 {d0, d1}, [r1]! + vst1ne.64 {d0, d1}, [r0 NEON_ALIGN(128)]! +.else + cmp r2, #16 + vld1ge.32 {d0, d1}, [r1]! + vst1ge.64 {d0, d1}, [r0 NEON_ALIGN(128)]! +.endif + tst r2, #8 + vld1ne.32 {d2}, [r1]! + vst1ne.64 {d2}, [r0 NEON_ALIGN(64)]! + tst r2, #4 + ldrne r3, [r1], #4 + strne r3, [r0 :32], #4 + + pop {r4} +#else + /* + * Just use the world-aligned tail code if we + * don't have Thumb2. + */ + b 17f +#endif + + /* + * Handle the last up to three bytes. Unaligned access + * may take place if source or destination is not + * half-word aligned. + */ +8: movs r2, r2, lsl #31 + ldrhcs r3, [r1], #2 + strhcs r3, [r0], #2 + ldrbne r3, [r1], #1 + strbne r3, [r0] +9: +.if \prefetch_distance > 0 || \early_prefetch == 1 + pop {r0} +.else + mov r0, ip +.endif + bx lr + +10: /* + * Unaligned case. Align the destination. + * Number of bytes is > 3. + * Note: This may use unaligned access. + * ip is the line_size aligned source address for preloads. + */ + cmp r2, #64 + push {r4} + /* For small sizes < 64 bytes just use the unaligned tail code. */ + blt 16f + ands r3, r0, #3 + beq 11f /* Destination is aligned but source is not. */ + /* Align the destination. */ + cmp r3, #2 + ldrbne r4, [r1], #1 + subne r2, r2, #1 + strbne r4, [r0], #1 + ldrhle r4, [r1], #2 + suble r2, r2, #2 + strhle r4, [r0], #2 + tst r1, #3 + beq 1b /* Destination and source are now aligned. */ + /* Destination is now aligned to a word boundary. */ +11: + cmp r2, #64 + /* + * Jump to non-aligned NEON tail code for <= 64 bytes. + */ + ble 16f + subs r2, r2, #\line_size + + /* Align destination to a 32-byte boundary. */ + ands r4, r0, #31 + rsb r4, r4, #32 + beq 20f + tst r4, #4 + sub r2, r2, r4 + ldrne r3, [r1 :8], #4 /* Unaligned access. */ + strne r3, [r0 :32], #4 + tst r4, #8 +#ifdef CONFIG_THUMB + /* + * Use conditional NEON instructions when + * available (Thumb2 mode) + */ + vld1ne.8 {d0}, [r1]! + vst1ne.64 {d0}, [r0 NEON_ALIGN(64)]! + cmp r4, #16 + vld1ge.8 {d2, d3}, [r1]! + vst1ge.64 {d2, d3}, [r0 NEON_ALIGN(128)]! +#else + beq 31f + vld1.8 {d0}, [r1]! + vst1.64 {d0}, [r0 NEON_ALIGN(64)]! +31: cmp r4, #16 + blt 32f + vld1.8 {d2, d3}, [r1]! + vst1.64 {d2, d3}, [r0 NEON_ALIGN(128)]! 
+32: +#endif + cmp r2, #0 + addlt r2, r2, #\line_size + blt 16f +20: + +.if \early_prefetch == 1 + pld [ip, #\line_size] +.endif +.if \prefetch_distance > 0 + /* + * Assume a preload at aligned base + line_size will + * be useful. + */ + push {r5} + mov r4, ip + add r5, r1, #(\prefetch_distance * \line_size) + subs r2, r2, #(\prefetch_distance * \line_size) + bic r3, r5, #(\line_size - 1) + add r4, r4, #(2 * \line_size) + blt 15f + cmp r4, r3 + sub ip, r3, r1 + /* + * "Catch-up" the early preloads (which have been performed up + * to aligned source address + line_size) to the preload offset + * used in the main loop. + */ + bge 13f +12: adds r4, r4, #\line_size /* Thumb16 */ + cmp r4, r3 + pld [r4, #(- \line_size)] + blt 12b +.endif + +13: + /* + * Process 64 unaligned bytes from source at a time and copy + * them to the 32-byte aligned destination. + */ +14: +.if \prefetch_distance > 0 + pld [r1, ip] +.endif +15: +.if \line_size == 32 + vld1.8 {d0-d3}, [r1]! + subs r2, r2, #32 + vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]! +.else /* line_size == 64 */ + vld1.8 {d0-d3}, [r1]! + vld1.8 {d4-d7}, [r1]! + vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]! + subs r2, r2, #64 + vst1.64 {d4-d7}, [r0 NEON_ALIGN(256)]! +.endif + bge 14b +.if \prefetch_distance > 0 + cmn r2, #(\prefetch_distance * \line_size) + bge 15b +.endif + /* Correct the count. */ + adds r2, r2, #((\prefetch_distance + 1) * \line_size) +.if \prefetch_distance > 0 + pop {r5} +.endif + + /* + * Handle last 0-(line_size - 1) bytes (destination 32-byte + * aligned source unaligned). + */ +#ifdef CONFIG_THUMB + /* + * Use conditional NEON instructions when + * available (Thumb2 mode) + */ +.if \line_size == 64 + cmp r2, #32 + vld1ge.8 {d0-d3}, [r1]! + vst1ge.64 {d0-d3}, [r0 NEON_ALIGN(128)]! + tst r2, #16 + vld1ne.8 {d0, d1}, [r1]! + vst1ne.64 {d0, d1}, [r0 NEON_ALIGN(128)]! +.else + cmp r2, #16 + vld1ge.8 {d0, d1}, [r1]! + vst1ge.64 {d0, d1}, [r0 NEON_ALIGN(128)]! +.endif + tst r2, #8 + vld1ne.8 {d2}, [r1]! + vst1ne.64 {d2}, [r0 NEON_ALIGN(64)]! + tst r2, #4 + ldrne r3, [r1], #4 + strne r3, [r0 :32], #4 + + pop {r4} + b 8b +#else + /* + * Fall through to the code below. It is not entirely + * optimal because it does not indicate the destination + * is word aligned. + */ +#endif + + /* Handle small size of 0-63 bytes, unaligned. */ +16: bic r3, r2, #7 + rsb r4, r3, #64 + tst r2, #7 + add pc, pc, r4 + nop + vld1.8 {d0}, [r1]! + vst1.8 {d0}, [r0]! + vld1.8 {d1}, [r1]! + vst1.8 {d1}, [r0]! + vld1.8 {d0}, [r1]! + vst1.8 {d0}, [r0]! + vld1.8 {d1}, [r1]! + vst1.8 {d1}, [r0]! + vld1.8 {d0}, [r1]! + vst1.8 {d0}, [r0]! + vld1.8 {d1}, [r1]! + vst1.8 {d1}, [r0]! + vld1.8 {d0}, [r1]! + vst1.8 {d0}, [r0]! + vld1.8 {d1}, [r1]! + vst1.8 {d1}, [r0]! + pop {r4} + beq 9b + tst r2, #4 + ldrne r3, [r1 :8], #4 /* Unaligned access. */ + strne r3, [r0], #4 + b 8b + + /* Handle small size of 0-63 bytes, word aligned. */ +17: +#ifdef CONFIG_THUMB + cmp r2, #32 + vld1ge.32 {d0-d3}, [r1]! + vst1ge.32 {d0-d3}, [r0]! + tst r2, #16 + vld1ne.32 {d0, d1}, [r1]! + vst1ne.32 {d0, d1}, [r0]! + tst r2, #8 + vld1ne.32 {d2}, [r1]! + vst1ne.32 {d2}, [r0]! + tst r2, #7 +#else + bic r3, r2, #7 + rsb r4, r3, #64 + tst r2, #7 + add pc, pc, r4 + nop + vld1.32 {d0}, [r1]! + vst1.32 {d0}, [r0]! + vld1.32 {d1}, [r1]! + vst1.32 {d1}, [r0]! + vld1.32 {d0}, [r1]! + vst1.32 {d0}, [r0]! + vld1.32 {d1}, [r1]! + vst1.32 {d1}, [r0]! + vld1.32 {d0}, [r1]! + vst1.32 {d0}, [r0]! + vld1.32 {d1}, [r1]! + vst1.32 {d1}, [r0]! + vld1.32 {d0}, [r1]! + vst1.32 {d0}, [r0]! + vld1.32 {d1}, [r1]! 
+ vst1.32 {d1}, [r0]! +#endif + pop {r4} + beq 9b + tst r2, #4 + ldrne r3, [r1], #4 + strne r3, [r0], #4 + b 8b + + /* + * Fast path for <= 256 bytes, word aligned. + * This is hardcoded for a preload offset of 128 bytes, + * which seems to work well in practice for small sizes. + */ +18: bics r3, r2, #31 +.if \early_prefetch == 1 + pld [ip, #32] + beq 21f + pld [ip, #64] + pld [ip, #96] +.endif + rsb r4, r3, #256 + ands r2, r2, #31 + /* + * Each code block handling 32 bytes is + * 12 bytes long. + */ + lsr r4, r4, #2 + add ip, ip, #128 + add r4, r4, r4, lsr #1 + sub ip, ip, r1 + add pc, pc, r4 + nop + pld [r1, ip] + vld1.32 {d0-d3}, [r1]! + vst1.32 {d0-d3}, [r0]! + pld [r1, ip] + vld1.32 {d4-d7}, [r1]! + vst1.32 {d4-d7}, [r0]! + pld [r1, ip] + vld1.32 {d0-d3}, [r1]! + vst1.32 {d0-d3}, [r0]! + pld [r1, ip] + vld1.32 {d4-d7}, [r1]! + vst1.32 {d4-d7}, [r0]! + pld [r1, ip] + vld1.32 {d0-d3}, [r1]! + vst1.32 {d0-d3}, [r0]! + W(nop) + vld1.32 {d4-d7}, [r1]! + vst1.32 {d4-d7}, [r0]! + W(nop) + vld1.32 {d0-d3}, [r1]! + vst1.32 {d0-d3}, [r0]! + W(nop) + vld1.32 {d4-d7}, [r1]! + vst1.32 {d4-d7}, [r0]! + beq 19f +21: +#ifdef CONFIG_THUMB + cmp r2, #16 + vld1ge.32 {d0-d1}, [r1]! + vst1ge.32 {d0-d1}, [r0]! + tst r2, #8 + vld1ne.32 {d0}, [r1]! + vst1ne.32 {d0}, [r0]! +#else + cmp r2, #16 + ldmiage r1!, {r3, r4} + stmiage r0!, {r3, r4} + ldmiage r1!, {r3, r4} + stmiage r0!, {r3, r4} + tst r2, #8 + ldmiane r1!, {r3, r4} + stmiane r0!, {r3, r4} +#endif + tst r2, #4 + pop {r4} + ldrne r3, [r1], #4 + strne r3, [r0 :32], #4 + and r2, r2, #3 + b 8b +19: + pop {r4} +.if \prefetch_distance > 0 || \early_prefetch == 1 + pop {r0} +.else + mov r0, ip +.endif + bx lr +.endm + + +#if defined(MEMCPY_REPLACEMENT_RPI) || defined(MEMCPY_REPLACEMENT_ARMV7_32) \ +|| defined(MEMCPY_REPLACEMENT_ARMV7_64) || defined(MEMCPY_REPLACEMENT_NEON_32) \ +|| defined(MEMCPY_REPLACEMENT_NEON_64) + +#ifdef MEMCPY_REPLACEMENT_RPI +asm_function memcpy + memcpy_variant 32, 3, 8, 0 +.endfunc +#endif + +#ifdef MEMCPY_REPLACEMENT_ARMV7_32 +asm_function memcpy + memcpy_variant 32, 6, 0, 0 +.endfunc +#endif + +#ifdef MEMCPY_REPLACEMENT_ARMV7_64 +asm_function memcpy + memcpy_variant 64, 3, 0, 0 +.endfunc +#endif + +#ifdef MEMCPY_REPLACEMENT_NEON_32 +asm_function memcpy + neon_memcpy_variant 32, 6, 1 +.endfunc +#endif + +#ifdef MEMCPY_REPLACEMENT_NEON_64 +asm_function memcpy + neon_memcpy_variant 64, 3, 1 +.endfunc +#endif + +#ifdef MEMCPY_REPLACEMENT_NEON_AUTO +asm_function memcpy + neon_memcpy_variant 32, 0, 1 +.endfunc +#endif + +#else + +asm_function memcpy_new_line_size_64_preload_192 + memcpy_variant 64, 3, 0, 0 +.endfunc + +asm_function memcpy_new_line_size_64_preload_192_align_32 + memcpy_variant 64, 3, 32, 0 +.endfunc + +asm_function memcpy_new_line_size_64_preload_192_aligned_access + memcpy_variant 64, 3, 0, 1 +.endfunc + +asm_function memcpy_new_line_size_32_preload_192 + memcpy_variant 32, 6, 0, 0 +.endfunc + +asm_function memcpy_new_line_size_32_preload_192_align_32 + memcpy_variant 32, 6, 32, 0 +.endfunc + +asm_function memcpy_new_line_size_32_preload_96 + memcpy_variant 32, 3, 8, 0 +.endfunc + +asm_function memcpy_new_line_size_32_preload_96_aligned_access + memcpy_variant 32, 3, 8, 1 +.endfunc + +asm_function memcpy_new_neon_line_size_64 + neon_memcpy_variant 64, 3, 1 +.endfunc + +asm_function memcpy_new_neon_line_size_32 + neon_memcpy_variant 32, 6, 1 +.endfunc + +asm_function memcpy_new_neon_line_size_32_auto + neon_memcpy_variant 32, 0, 1 +.endfunc + +#endif + +/* + * Macro for memset replacement. 
+/*
+ * Macro for memset replacement.
+ * write_align must be 0, 8, or 32.
+ * use_neon must be 0 or 1.
+ */
+
+.macro memset_variant write_align, use_neon
+.if \use_neon == 1
+	.fpu neon
+.endif
+	ands	r3, r0, #3
+	mov	ip, r0
+	bne	7f
+
+	/* Destination is word aligned. */
+1:	orr	r1, r1, r1, lsl #8
+.if \use_neon == 1
+	cmp	r2, #16
+.else
+	cmp	r2, #8
+.endif
+	orr	r1, r1, r1, lsl #16
+.if \use_neon == 1
+	blt	13f
+	vmov	d0, r1, r1
+	vmov	d1, r1, r1
+.else
+	blt	5f
+	mov	r3, r1
+.endif
+
+	cmp	r2, #64
+	push	{r4}
+.if \use_neon == 1
+	blt	10f
+.else
+	ble	10f
+.endif
+.if \write_align > 0
+	ands	r4, r0, #(\write_align - 1)
+.if \use_neon == 1
+#ifndef CONFIG_THUMB
+	add	r3, r4, #7
+#endif
+.endif
+	/* Let r4 be equal to the number of bytes to align. */
+	rsb	r4, r4, #\write_align
+	/*
+	 * At this point r4 contains the number of bytes to align
+	 * if eq is not set. The eq flag is set if there are no bytes
+	 * to align.
+	 */
+.if \write_align == 8
+	subne	r2, r2, r4
+	strne	r1, [r0], #4
+.elseif \write_align == 32
+	beq	2f
+	tst	r4, #4
+	sub	r2, r2, r4
+	strne	r1, [r0], #4
+.if \use_neon == 1
+#ifdef CONFIG_THUMB
+	tst	r4, #8
+	vst1ne.64	{d0}, [r0 NEON_ALIGN(64)]!
+	cmp	r4, #16
+	vst1ge.64	{d0, d1}, [r0 NEON_ALIGN(128)]!
+#else
+	bic	r4, r3, #7
+	lsr	r4, r4, #1
+	add	pc, pc, r4
+	nop
+	vst1.64	{d0}, [r0 NEON_ALIGN(64)]!
+	vst1.64	{d0}, [r0 NEON_ALIGN(64)]!
+	vst1.64	{d0}, [r0 NEON_ALIGN(64)]!
+	vst1.64	{d0}, [r0 NEON_ALIGN(64)]!
+#endif
+.else
+	tst	r4, #8
+	stmiane	r0!, {r1, r3}
+	cmp	r4, #16
+	stmiage	r0!, {r1, r3}
+	stmiage	r0!, {r1, r3}
+.endif
+.endif /* \write_align == 32 */
+	cmp	r2, #64
+	blt	4f
+.endif /* \write_align > 0 */
+
+2:
+.if \use_neon == 1
+	/*
+	 * When NEON is enabled, \write_align is
+	 * equal to 32 so specify 256-bit alignment in the
+	 * NEON store instructions.
+	 */
+	subs	r2, r2, #64
+	vmov	q1, q0
+3:	vst1.64	{d0-d3}, [r0 NEON_ALIGN(256)]!
+	subs	r2, r2, #64
+	vst1.64	{d0-d3}, [r0 NEON_ALIGN(256)]!
+	bge	3b
+	adds	r2, r2, #64
+.else
+	mov	r4, r1
+	subs	r2, r2, #64
+	push	{r5}
+	mov	r5, r1
+
+3:	stmia	r0!, {r1, r3, r4, r5}
+	subs	r2, r2, #64	/* Thumb16 */
+	stmia	r0!, {r1, r3, r4, r5}
+	stmia	r0!, {r1, r3, r4, r5}
+	stmia	r0!, {r1, r3, r4, r5}
+	bge	3b
+	adds	r2, r2, #64	/* Thumb16 */
+
+	pop	{r5}
+.endif
+	/* Early exit if there are 0 bytes left. */
+/* THUMB(	cbz	r2, 9f ) */
+THUMB(	cmp	r2, #0	)
+THUMB(	beq	9f	)
+ARM(	teq	r2, #0	)
+ARM(	beq	9f	)
+	/*
+	 * Handle 8-64 bytes (or 16-63 bytes in case of NEON).
+	 * In case of NEON, destination must be 8-byte aligned.
+	 */
+4:
+.if \use_neon == 1
+#ifdef CONFIG_THUMB
+	vmov	q1, q0
+	cmp	r2, #32
+	vst1ge.64	{d0-d3}, [r0 NEON_ALIGN(64)]!
+	tst	r2, #16
+	vst1ne.64	{d0, d1}, [r0 NEON_ALIGN(64)]!
+	tst	r2, #8
+	vst1ne.64	{d0}, [r0 NEON_ALIGN(64)]!
+	and	r2, r2, #7
+#else
+	bic	r4, r2, #15
+	subs	r2, r2, r4
+	rsb	r4, r4, #64
+	/*
+	 * When using NEON, the vst instruction
+	 * (storing 16 bytes) is always 32-bit.
+	 */
+	lsr	r4, r4, #2
+	add	pc, pc, r4
+	nop
+	vst1.64	{d0, d1}, [r0 NEON_ALIGN(64)]!
+	vst1.64	{d0, d1}, [r0 NEON_ALIGN(64)]!
+	vst1.64	{d0, d1}, [r0 NEON_ALIGN(64)]!
+	vst1.64	{d0, d1}, [r0 NEON_ALIGN(64)]!
+	cmp	r2, #8
+	strge	r1, [r0], #4
+	strge	r1, [r0], #4
+	subge	r2, r2, #8
+#endif
+.else /* use_neon == 0 */
+	bic	r4, r2, #7
+	subs	r2, r2, r4
+	rsb	r4, r4, #64
+	/*
+	 * The stmia instruction (storing 8 bytes) is 32-bit for ARM,
+	 * 16-bit for Thumb2.
+	 */
+THUMB(	lsrs	r4, r4, #2	)
+ARM(	lsr	r4, r4, #1	)
+	add	pc, pc, r4
+	nop
+	stmia	r0!, {r1, r3}
+	stmia	r0!, {r1, r3}
+	stmia	r0!, {r1, r3}
+	stmia	r0!, {r1, r3}
+	stmia	r0!, {r1, r3}
+	stmia	r0!, {r1, r3}
+	stmia	r0!, {r1, r3}
+	stmia	r0!, {r1, r3}
+.endif
+14:	pop	{r4}
+
+5:	cmp	r2, #4
+	strge	r1, [r0], #4
+	/* Early exit for multiple of 4 size. */
+	ands	r2, r2, #3
+	moveq	r0, ip
+	bxeq	lr
+
+	/*
+	 * At this point there are 1, 2 or 3 bytes,
+	 * and the destination is aligned.
+	 */
+6:	cmp	r2, #2
+	strhge	r1, [r0], #2
+	strbne	r1, [r0]
+	mov	r0, ip
+	bx	lr
+
+.if \use_neon == 1
+	/* 0-15 bytes left, word aligned. */
+13:	cmp	r2, #8
+	strge	r1, [r0]
+	strge	r1, [r0, #4]
+	addge	r0, r0, #8
+	subge	r2, r2, #8
+	b	5b
+.endif
+
+	/* Unaligned case. */
+7:	cmp	r2, #4
+	blt	8f
+#ifdef CONFIG_THUMB
+.if \use_neon == 1
+	/*
+	 * When Thumb2 is enabled with NEON, use the optimized
+	 * unaligned NEON code path for small sizes.
+	 */
+	cmp	r2, #64
+	blt	11f
+.endif
+#endif
+	/* Align the destination. */
+	cmp	r3, #2
+	sub	r2, r2, #4
+	strble	r1, [r0]
+	strble	r1, [r0, #1]
+	addle	r0, r0, #2
+	add	r2, r2, r3
+	strbne	r1, [r0], #1
+	b	1b
+
+	/* 0 to 3 bytes left. */
+8:	cmp	r2, #2
+	strbge	r1, [r0]
+	strbge	r1, [r0, #1]
+	addge	r0, r0, #2
+	tst	r2, #1
+	strbne	r1, [r0]
+	mov	r0, ip
+	bx	lr
+
+9:	pop	{r4}
+	mov	r0, ip
+	bx	lr
+
+	/*
+	 * Word aligned 8 <= size <= 64
+	 * (16 <= size <= 63 in case of NEON).
+	 */
+10:
+	/* Align the destination to an 8 byte boundary. */
+	tst	r0, #4
+	strne	r1, [r0], #4
+	subne	r2, r2, #4
+.if \use_neon == 1
+	cmp	r2, #16
+	poplt	{r4}
+	blt	13b
+.else
+	cmp	r2, #8
+	blt	14b
+.endif
+	b	4b
+
+#ifdef CONFIG_THUMB
+.if \use_neon == 1
+	/*
+	 * Handle 4 <= size <= 63 bytes, unaligned.
+	 * Use unaligned NEON instructions with Thumb2.
+	 */
+11:
+	orr	r1, r1, r1, lsl #8
+	tst	r2, #8
+	orr	r1, r1, r1, lsl #16
+	vmov	d0, r1, r1
+	vst1ne.8	{d0}, [r0]!
+	vmov	d1, r1, r1
+	tst	r2, #16
+	vst1ne.8	{d0, d1}, [r0]!
+	vmov	q1, q0
+	cmp	r2, #32
+	and	r2, r2, #7
+	vst1ge.8	{d0-d3}, [r0]!
+	cmp	r2, #4
+	/* The following store is unaligned. */
+	strge	r1, [r0], #4
+	subge	r2, r2, #4
+	b	8b
+.endif
+#endif
+.endm
+
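+/*
+ * The MEMSET_REPLACEMENT_* blocks below export a single variant under the
+ * name memset itself; when none of those macros is defined, the individually
+ * named variants (memset_new_align_0/8/32, memset_neon) are exported instead
+ * (see new_arm.h) so a caller can choose between them at run time.  For
+ * example, a NEON build could define MEMSET_REPLACEMENT_NEON_32 to get
+ * "memset_variant 32, 1" as its memset.
+ */
+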
+#if defined(MEMSET_REPLACEMENT_RPI) || defined(MEMSET_REPLACEMENT_ARMV7_32) \
+|| defined(MEMSET_REPLACEMENT_ARMV7_64) || defined(MEMSET_REPLACEMENT_NEON_32) \
+|| defined(MEMSET_REPLACEMENT_NEON_64)
+
+#ifdef MEMSET_REPLACEMENT_RPI
+asm_function memset
+	memset_variant 32, 0
+.endfunc
+#endif
+
+#if defined(MEMSET_REPLACEMENT_ARMV7_32) || defined(MEMSET_REPLACEMENT_ARMV7_64)
+asm_function memset
+	memset_variant 8, 0
+.endfunc
+#endif
+
+#if defined(MEMSET_REPLACEMENT_NEON_32) || defined(MEMSET_REPLACEMENT_NEON_64)
+asm_function memset
+	memset_variant 32, 1
+.endfunc
+#endif
+
+#else
+
+asm_function memset_new_align_0
+	memset_variant 0, 0
+.endfunc
+
+asm_function memset_new_align_8
+	memset_variant 8, 0
+.endfunc
+
+asm_function memset_new_align_32
+	memset_variant 32, 0
+.endfunc
+
+asm_function memset_neon
+	memset_variant 32, 1
+.endfunc
+
+#endif
diff --git a/veejay-current/veejay-server/thirdparty/fastarm/new_arm.h b/veejay-current/veejay-server/thirdparty/fastarm/new_arm.h
new file mode 100644
index 00000000..4d13699e
--- /dev/null
+++ b/veejay-current/veejay-server/thirdparty/fastarm/new_arm.h
@@ -0,0 +1,35 @@
+
+extern void *memcpy_new_line_size_64_preload_192(void *dest,
+	const void *src, size_t n);
+
+extern void *memcpy_new_line_size_64_preload_192_align_32(void *dest,
+	const void *src, size_t n);
+
+extern void *memcpy_new_line_size_64_preload_192_aligned_access(void *dest,
+	const void *src, size_t n);
+
+extern void *memcpy_new_line_size_32_preload_192(void *dest,
+	const void *src, size_t n);
+
+extern void *memcpy_new_line_size_32_preload_192_align_32(void *dest,
+	const void *src, size_t n);
+
+extern void *memcpy_new_line_size_32_preload_96(void *dest,
+	const void *src, size_t n);
+
+extern void *memcpy_new_line_size_32_preload_96_aligned_access(void *dest,
+	const void *src, size_t n);
+
+extern void *memcpy_new_neon_line_size_64(void *dest, const void *src, size_t n);
+
+extern void *memcpy_new_neon_line_size_32(void *dest, const void *src, size_t n);
+
+extern void *memcpy_new_neon_line_size_32_auto(void *dest, const void *src, size_t n);
+
+extern void *memset_new_align_0(void *dest, int c, size_t size);
+
+extern void *memset_new_align_8(void *dest, int c, size_t size);
+
+extern void *memset_new_align_32(void *dest, int c, size_t size);
+
+extern void *memset_neon(void *dest, int c, size_t size);
diff --git a/veejay-current/veejay-server/veejay/Makefile.am b/veejay-current/veejay-server/veejay/Makefile.am
index 7c4f621d..753aad89 100644
--- a/veejay-current/veejay-server/veejay/Makefile.am
+++ b/veejay-current/veejay-server/veejay/Makefile.am
@@ -66,6 +66,10 @@ if !HAVE_MJPEGTOOLS
 libveejay_la_LIBADD+=-L$(top_builddir)/thirdparty/mjpegtools -lmjpegutils
 endif
 
+if HAVE_ARM
+libveejay_la_LIBADD+=-L$(top_builddir)/thirdparty/fastarm -lfastarm
+endif
+
 libveejay_la_LDFLAGS += $(SDL_LIBS) $(SDL_TTF_LIBS) $(DIRECTFB_LIBS) $(X_LIBS) $(PTHREAD_LIBS) $(FT_LDFLAGS) $(FT_LIBS) \
 	$(XML2_LIBS) $(JPEG_LIBS) $(LIBLO_LIBS) $(LIBUNWIND_LIBS) $(GLIB_LIBS) \
 	$(FFMPEG_LIBS) $(XINERAMA_LIBS) $(MJPEGTOOLS_LIBS) $(LIBPNG_LIBS) \
diff --git a/veejay-current/veejay-server/veejay/veejay.c b/veejay-current/veejay-server/veejay/veejay.c
index 0b4a2cef..75b84cd0 100644
--- a/veejay-current/veejay-server/veejay/veejay.c
+++ b/veejay-current/veejay-server/veejay/veejay.c
@@ -788,7 +788,7 @@ int main(int argc, char **argv)
 {
 	veejay_free(info);
 	return 0;
-	}
+	}
 	print_license();
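
Note: the fastarm entry points above keep the standard memcpy()/memset() signatures, so the caller can time them against each other and against the C library before settling on one. A minimal sketch of such a selection loop follows; it only assumes the declarations from new_arm.h (on the include path) and an ARM/NEON build linked with -lfastarm. The helper names and the candidate list are illustrative, not part of this patch.

	#include <stdio.h>
	#include <string.h>
	#include <stdint.h>
	#include <time.h>
	#include "new_arm.h"

	typedef void *(*memcpy_fn)(void *, const void *, size_t);

	/* Candidate list: glibc memcpy plus two fastarm variants (illustrative). */
	static const struct { const char *name; memcpy_fn fn; } candidates[] = {
		{ "libc memcpy",                         memcpy },
		{ "memcpy_new_neon_line_size_32",        memcpy_new_neon_line_size_32 },
		{ "memcpy_new_line_size_32_preload_192", memcpy_new_line_size_32_preload_192 },
	};

	/* Time one candidate copying `size` bytes `iterations` times. */
	static double bench(memcpy_fn fn, uint8_t *dst, const uint8_t *src,
	                    size_t size, int iterations)
	{
		struct timespec t0, t1;
		clock_gettime(CLOCK_MONOTONIC, &t0);
		for (int i = 0; i < iterations; i++)
			fn(dst, src, size);
		clock_gettime(CLOCK_MONOTONIC, &t1);
		return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
	}

	int main(void)
	{
		enum { SIZE = 720 * 576, ITER = 2000 };	/* one PAL-sized plane */
		static uint8_t src[SIZE], dst[SIZE];
		memset(src, 0x80, SIZE);

		size_t best = 0;
		double best_t = 1e9;
		for (size_t i = 0; i < sizeof(candidates) / sizeof(candidates[0]); i++) {
			double t = bench(candidates[i].fn, dst, src, SIZE, ITER);
			printf("%-40s %.3f s\n", candidates[i].name, t);
			if (t < best_t) { best_t = t; best = i; }
		}
		printf("fastest: %s\n", candidates[best].name);
		return 0;
	}

The same pattern applies to the memset_new_align_* and memset_neon variants, which share the memset() signature.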