From 1eb4ff18ebe5ff6989c69afef6bd8da384019be0 Mon Sep 17 00:00:00 2001 From: c0ntrol Date: Sat, 13 Feb 2016 18:53:36 +0100 Subject: [PATCH] add fastarm memcpy --- veejay-current/veejay-server/configure.ac | 50 +- .../veejay-server/libvjmem/memcpy.c | 267 +-- .../veejay-server/libvjmem/vj-x86.c | 4 - .../veejay-server/thirdparty/Makefile.am | 5 + .../thirdparty/fastarm/Makefile.am | 10 + .../veejay-server/thirdparty/fastarm/README | 97 + .../thirdparty/fastarm/new_arm.S | 1858 +++++++++++++++++ .../thirdparty/fastarm/new_arm.h | 35 + .../veejay-server/veejay/Makefile.am | 4 + veejay-current/veejay-server/veejay/veejay.c | 2 +- 10 files changed, 2172 insertions(+), 160 deletions(-) create mode 100644 veejay-current/veejay-server/thirdparty/fastarm/Makefile.am create mode 100644 veejay-current/veejay-server/thirdparty/fastarm/README create mode 100644 veejay-current/veejay-server/thirdparty/fastarm/new_arm.S create mode 100644 veejay-current/veejay-server/thirdparty/fastarm/new_arm.h diff --git a/veejay-current/veejay-server/configure.ac b/veejay-current/veejay-server/configure.ac index 99b59584..8fa62cf2 100644 --- a/veejay-current/veejay-server/configure.ac +++ b/veejay-current/veejay-server/configure.ac @@ -376,6 +376,27 @@ esac CFLAGS="$CFLAGS -D_FILE_OFFSET_BITS=64 -D_LARGE_FILES" +if test x$host_alias != x; then + dnl Cross compiling + AC_MSG_CHECKING(sub-architecture settings) + if test x$have_x86cpu = xtrue; then + host_mod_cpu=`echo $host_cpu|tr _ -` + ARCHFLAGS="-march=$host_mod_cpu -mcpu=$host_mod_cpu" + AC_MSG_RESULT($ARCHFLAGS) + fi +else + AC_MSG_CHECKING(sub-architecture settings) + + chmod +x $srcdir/cpuinfo.sh + + if test "$arch_target" = "auto"; then + TMP=`$srcdir/cpuinfo.sh` + ARCHFLAGS=`cat veejay.arch` + else + ARCHFLAGS="-mtune=generic" + fi + AC_MSG_RESULT($ARCHFLAGS) +fi dnl ARM architecture detect NEON and set CFLAGS if test x$have_arm = xtrue @@ -392,8 +413,11 @@ then if test $ac_cv_flag_neon = yes ; then AC_DEFINE(HAVE_ARM_NEON,1,[Compiling in NEON support]) USER_CFLAGS="-mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad $USER_CFLAGS" + FASTARM_CFLAGS="$ARCHFLAGS -Wa,-march=armv7-a -mthumb -Wa,-mthumb -Wa,-mimplicit-it=always -mthumb-interwork -DCONFIG_THUMB" + else USER_CFLAGS="-march=native -ftree-vectorize $USER_CFLAGS" + FASTARM_CFLAGS="$ARCHFLAGS -Wa, -mthumb -Wa,-mthumb -Wa,-mimplicit-it=always -mthumb-interwork -DCONFIG_THUMB" fi if test "x$enable_debug" != "xyes" ; then @@ -406,6 +430,8 @@ then SUBSAMPLE_CFLAGS="$USER_CFLAGS" VJE_CFLAGS="$USER_CFLAGS" CFLAGS="$USER_CFLAGS -D_FILE_OFFSET_BITS=64 -D_LARGE_FILES" + + AC_SUBST(FASTARM_CFLAGS) fi dnl This flag is used for PROGRAMS not SHARED LIBRARIES. 
PIC code is required @@ -584,28 +610,6 @@ EOF fi fi -if test x$host_alias != x; then - dnl Cross compiling - AC_MSG_CHECKING(sub-architecture settings) - if test x$have_x86cpu = xtrue; then - host_mod_cpu=`echo $host_cpu|tr _ -` - ARCHFLAGS="-march=$host_mod_cpu -mcpu=$host_mod_cpu" - AC_MSG_RESULT($ARCHFLAGS) - fi -else - AC_MSG_CHECKING(sub-architecture settings) - - chmod +x $srcdir/cpuinfo.sh - - if test "$arch_target" = "auto"; then - TMP=`$srcdir/cpuinfo.sh` - ARCHFLAGS=`cat veejay.arch` - else - ARCHFLAGS="-mtune=generic" - fi - AC_MSG_RESULT($ARCHFLAGS) -fi - have_mjpegtools=false AC_SUBST(MJPEGTOOLS_CFLAGS) AC_SUBST(MJPGETOOLS_LIBS) @@ -1074,6 +1078,7 @@ AM_CONDITIONAL(HAVE_JPEG,test x$have_jpeg = xtrue) AM_CONDITIONAL(HAVE_LIBLO,test x$have_liblo = xtrue) AM_CONDITIONAL(HAVE_FREETYPE2, test x$have_freetype2 = xtrue) AM_CONDITIONAL(HAVE_MJPEGTOOLS, test x$have_mjpegtools = xtrue ) +AM_CONDITIONAL(HAVE_ARM, test x$have_arm = xtrue ) dnl ********************************************************************* dnl Check for what warnings we want gcc to use and adjust the CFLAGS dnl as needed. This only works for GCC. @@ -1161,6 +1166,7 @@ fi AC_CONFIG_FILES([ thirdparty/Makefile +thirdparty/fastarm/Makefile thirdparty/aclib/Makefile thirdparty/bio2jack/Makefile thirdparty/libhash/Makefile diff --git a/veejay-current/veejay-server/libvjmem/memcpy.c b/veejay-current/veejay-server/libvjmem/memcpy.c index 184a6d18..5e4a9c13 100644 --- a/veejay-current/veejay-server/libvjmem/memcpy.c +++ b/veejay-current/veejay-server/libvjmem/memcpy.c @@ -141,6 +141,10 @@ #include #include #include +#ifdef HAVE_ARM +#include +#endif + #define BUFSIZE 1024 @@ -157,37 +161,12 @@ static int selected_best_memcpy = 1; static int selected_best_memset = 1; -#ifdef HAVE_POSIX_TIMERS -static int64_t _x_gettime(void) +static double get_time() { - struct timespec tm; - return (clock_gettime(CLOCK_THREAD_CPUTIME_ID,&tm) == -1 ) - ? 
times(NULL) - : (int64_t) tm.tv_sec * 1e9 + tm.tv_nsec; + struct timespec ts; + clock_gettime( CLOCK_REALTIME, &ts ); + return (double) ts.tv_sec + (double) ts.tv_nsec / 1000000000.0; } -#define rdtsc(x) _x_gettime() -#elif (defined(ARCH_X86) || defined(ARCH_X86_64)) && defined(HAVE_SYS_TIMES_H) -static int64_t rdtsc(int cpu_flags) -{ - int64_t x; - if( cpu_flags & AV_CPU_FLAGS_MMX ) { - __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x)); - return x; - } else { - return times(NULL); - } -} -#else -static uint64_t rdtsc(int cpu_flags) -{ -#ifdef HAVE_SYS_TIMES_H - struct tms tp; - return times(&tp); -#else - return clock(); -#endif -} -#endif /* HAVE_SYS_TIMES_H */ #if defined(ARCH_X86) || defined (ARCH_X86_64) /* for small memory blocks (<256 bytes) this version is faster */ @@ -251,7 +230,6 @@ void yuyv_plane_clear( size_t len, void *to ) if( vj_task_available() ) { uint8_t * t = (uint8_t*) to; uint8_t *in[4] = { t, NULL,NULL,NULL }; - int strides[4] = { len, 0,0,0 }; vj_task_run( in, in, NULL, NULL, 1, (performer_job_routine) &yuyv_plane_clear_job ); } else { @@ -1349,10 +1327,10 @@ static void *memcpy_neon( void *to, const void *from, size_t n ) static struct { - char *name; - void *(*function)(void *to, const void *from, size_t len); - uint64_t time; - uint32_t cpu_require; + char *name; + void *(*function)(void *to, const void *from, size_t len); + double t; + uint32_t cpu_require; } memcpy_method[] = { { NULL, NULL, 0}, @@ -1382,23 +1360,44 @@ static struct { #endif #ifdef HAVE_ARM_NEON { "NEON optimized memcpy()", (void*) memcpy_neon, 0, AV_CPU_FLAG_NEON }, +#endif +#ifdef HAVE_ARM + { "new mempcy for cortex with line size of 32, preload offset of 192 (C) Harm Hanemaaijer ", (void*) memcpy_new_line_size_32_preload_192,0,0 }, + { "new memcpy for cortex with line size of 64, preload offset of 192 (C) Harm Hanemaaijer " ,(void*) memcpy_new_line_size_64_preload_192, 0, 0 }, + { "new memcpy for cortex with line size of 64, preload offset of 192, aligned access (C) Harm Hanemaaijer ", (void*) memcpy_new_line_size_64_preload_192_aligned_access, 0, 0 }, + { "new memcpy for cortex with line size of 32, preload offset of 192, align 32", (void*) memcpy_new_line_size_32_preload_192_align_32,0,0}, + { "new memcpy for cortex with line size of 32, preload offset of 96", (void*) memcpy_new_line_size_32_preload_96,0,0}, + { "new memcpy for cortex with line size of 32, preload offset of 96, aligned access", (void*) memcpy_new_line_size_32_preload_96_aligned_access,0,0}, +#endif +#ifdef HAVE_ARM_NEON + { "new memcpy for cortex using NEON with line size of 32, preload offset of 192 (C) Harm Hanemaaijer ", (void*) memcpy_new_neon_line_size_32,0,AV_CPU_FLAG_NEON}, + { "new memcpy for cortex using NEON with line size of 64, preload offset of 192 (C) Harm Hanemaaijer ", (void*) memcpy_new_neon_line_size_64,0,AV_CPU_FLAG_NEON}, + { "new mempcy for cortex using NEON with line size of 32, automatic prefetcher (C) Harm Hanemaaijer ", (void*) memcpy_new_neon_line_size_32_auto,0,AV_CPU_FLAG_NEON}, #endif { NULL, NULL, 0}, }; static struct { - char *name; - void *(*function)(void *to, uint8_t c, size_t len); - uint64_t time; - uint32_t cpu_require; + char *name; + void *(*function)(void *to, uint8_t c, size_t len); + uint32_t cpu_require; + double t; } memset_method[] = { - { NULL, NULL, 0,0}, - { "glibc memset()", (void*)memset, 0,0}, + { NULL, NULL, 0,0}, + { "glibc memset()",(void*)memset,0,0}, #if defined(HAVE_ASM_MMX) || defined(HAVE_ASM_MMX2) || defined(HAVE_ASM_SSE) - { "MMX/MMX2/SSE optimized memset()", 
(void*) fast_memset, 0, AV_CPU_FLAG_MMX|AV_CPU_FLAG_SSE|AV_CPU_FLAG_MMX2}, + { "MMX/MMX2/SSE optimized memset()", (void*) fast_memset,0,AV_CPU_FLAG_MMX|AV_CPU_FLAG_SSE|AV_CPU_FLAG_MMX2 }, #endif - { NULL, NULL, 0,0}, +#ifdef HAVE_ARM_NEON + { "memset_neon (C) Harm Hanemaaijer ", (void*) memset_neon,0, AV_CPU_FLAG_NEON }, +#endif +#ifdef HAVE_ARM + { "memset align 0 (C) Harm Hanemaaijer ", (void*) memset_new_align_0,0,0 }, + { "memset align 8 (C) Harm Hanemaaijer ", (void*) memset_new_align_8,0,0 }, + { "memset align 32 (C) Harm Hanemaaijer ", (void*) memset_new_align_32,0,0 }, +#endif + { NULL, NULL, 0, 0}, }; @@ -1407,10 +1406,10 @@ void memcpy_report() int i; fprintf(stdout,"SIMD benchmark results:\n"); for( i = 1; memset_method[i].name; i ++ ) { - fprintf(stdout,"\t%8ld : %s\n",(long) memset_method[i].time, memset_method[i].name ); + fprintf(stdout,"\t%g : %s\n",memset_method[i].t, memset_method[i].name ); } for( i = 1; memcpy_method[i].name; i ++ ) { - fprintf(stdout,"\t%8ld : %s\n",(long) memcpy_method[i].time, memcpy_method[i].name ); + fprintf(stdout,"\t%g : %s\n",memcpy_method[i].t, memcpy_method[i].name ); } } @@ -1430,7 +1429,7 @@ char *get_memset_descr() void find_best_memcpy() { - uint64_t t; + double t; char *buf1, *buf2; int i, best = 0,k; int bufsize = 720 * 576 * 3; @@ -1445,6 +1444,8 @@ void find_best_memcpy() int cpu_flags = av_get_cpu_flags(); + veejay_msg(VEEJAY_MSG_DEBUG, "Finding best memcpy ..." ); + memset(buf1,0, bufsize); memset(buf2,0, bufsize); @@ -1454,28 +1455,28 @@ void find_best_memcpy() for( i = 1; memcpy_method[i].name; i ++ ) { - t = rdtsc(cpu_flags); + t = get_time(); if( memcpy_method[i].cpu_require && !(cpu_flags & memcpy_method[i].cpu_require ) ) { - memcpy_method[i].time = 0; + memcpy_method[i].t = 0.0; continue; } for( k = 0; k < 128; k ++ ) { memcpy_method[i].function( buf1,buf2, bufsize ); } - t = rdtsc(cpu_flags) - t; - memcpy_method[i].time = t; + t = get_time() - t; + memcpy_method[i].t = t; } for( i = 1; memcpy_method[i].name; i ++ ) { if(best == 0 ) { best = i; - t = memcpy_method[i].time; + t = memcpy_method[i].t; continue; } - if( memcpy_method[i].time < t && memcpy_method[i].time > 0 ) { - t = memcpy_method[i].time; + if( memcpy_method[i].t < t && memcpy_method[i].t > 0 ) { + t = memcpy_method[i].t; best = i; } } @@ -1494,53 +1495,55 @@ void find_best_memcpy() void find_best_memset() { - uint64_t t; - char *buf1, *buf2; - int i, best = 0,k; + double t; + char *buf1, *buf2; + int i, best = 0,k; int bufsize = 720 * 576 * 3; int cpu_flags = av_get_cpu_flags(); - if (!(buf1 = (char*) malloc( bufsize * sizeof(char) ))) - return; + if (!(buf1 = (char*) malloc( bufsize * sizeof(char) ))) + return; + + if (!(buf2 = (char*) malloc( bufsize * sizeof(char) ))) { + free( buf1 ); + return; + } + + veejay_msg(VEEJAY_MSG_DEBUG, "Finding best memset..." 
); - if (!(buf2 = (char*) malloc( bufsize * sizeof(char) ))) { - free( buf1 ); - return; - } - memset( buf1, 0, bufsize * sizeof(char)); memset( buf2, 0, bufsize * sizeof(char)); - for (i=1; memset_method[i].name; i++) - { + for (i=1; memset_method[i].name; i++) + { if( memset_method[i].cpu_require && !(cpu_flags & memset_method[i].cpu_require ) ) { - memset_method[i].time= 0; + memset_method[i].t= 0; continue; - } + } - t = rdtsc(cpu_flags); - for( k = 0; k < 128; k ++ ) { - memset_method[i].function( buf1 , 0 , bufsize ); - } - t = rdtsc(cpu_flags) - t; - - memset_method[i].time = t; + t = get_time(); + for( k = 0; k < 128; k ++ ) { + memset_method[i].function( buf1 , 0 , bufsize ); + } + t = get_time() - t; + + memset_method[i].t = t; - if (best == 0 || t < memset_method[best].time) - best = i; - } + if (best == 0 || t < memset_method[best].t) + best = i; + } - if (best) { - veejay_memset = memset_method[best].function; - } else { - veejay_memset = memset_method[1].function; - } + if (best) { + veejay_memset = memset_method[best].function; + } + else { + veejay_memset = memset_method[1].function; + } selected_best_memset = best; - free( buf1 ); - free( buf2 ); - + free( buf1 ); + free( buf2 ); } static void vj_frame_copy_job( void *arg ) { @@ -1721,122 +1724,118 @@ void vj_frame_clear1( uint8_t *input, unsigned int val, int size ) vj_frame_clear( in, strides, val ); } -static unsigned long benchmark_single_slow(long c, int n_tasks, uint8_t **source, uint8_t **dest, int *planes) +static double benchmark_single_slow(long c, int n_tasks, uint8_t **source, uint8_t **dest, int *planes) { - uint64_t k; - uint64_t stats[c]; + int k; + double stats[c]; uint64_t bytes = ( planes[0] + planes[1] + planes[2] + planes[3] ); for( k = 0; k < c; k ++ ) { - uint64_t t = rdtsc(0); + double t = get_time(); vj_frame_slow_single( source, source, dest, planes[0], planes[1]/2, 0.67f ); - t = rdtsc(0) - t; + t = get_time() - t; stats[k] = t; } - uint64_t sum = 0; + double sum = 0.0; for( k = 0; k < c ;k ++ ) sum += stats[k]; - uint64_t best_time = (sum / c ); + double best_time = (sum / c ); - veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %2.2f ms", - (float)(bytes /1048576.0f), (best_time/1000.0f)); + veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %g",(float)((bytes*c) /1048576.0f), best_time); return best_time; } -static unsigned long benchmark_threaded_slow(long c, int n_tasks, uint8_t **source, uint8_t **dest, int *planes) +static double benchmark_threaded_slow(long c, int n_tasks, uint8_t **source, uint8_t **dest, int *planes) { - uint64_t k; - uint64_t stats[c]; + int k; + double stats[c]; uint64_t bytes = ( planes[0] + planes[1] + planes[2] + planes[3] ); for( k = 0; k < c; k ++ ) { - uint64_t t = rdtsc(0); + uint64_t t = get_time(); vj_frame_slow_threaded( source, source, dest, planes[0], planes[1]/2, 0.67f ); - t = rdtsc(0) - t; + t = get_time() - t; stats[k] = t; } - uint64_t sum = 0; + double sum = 0.0; for( k = 0; k < c ;k ++ ) sum += stats[k]; - uint64_t best_time = (sum / c ); + double best_time = (sum / c ); - veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %2.2f ms", - (float)(bytes /1048576.0f), (best_time/1000.0f)); + veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %g",(float)((bytes*c) /1048576.0f), best_time); return best_time; } -static unsigned long benchmark_threaded_copy(long c, int n_tasks, uint8_t **dest, uint8_t **source, int *planes) +static double benchmark_threaded_copy(long c, int n_tasks, uint8_t **dest, uint8_t **source, int *planes) { - uint64_t k; - uint64_t stats[c]; + int k; + 
double stats[c]; uint64_t bytes = ( planes[0] + planes[1] + planes[2] + planes[3] ); for( k = 0; k < c; k ++ ) { - uint64_t t = rdtsc(0); + double t = get_time(); vj_frame_copyN( source,dest,planes ); - t = rdtsc(0) - t; + t = get_time() - t; stats[k] = t; } - uint64_t sum = 0; + double sum = 0.0; for( k = 0; k < c ;k ++ ) sum += stats[k]; - uint64_t best_time = (sum / c ); + double best_time = (sum / c ); - veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %2.2f ms", - (float)(bytes /1048576.0f), (best_time/1000.0f)); + veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %g",(float)((bytes*c) /1048576.0f), best_time); return best_time; } -static unsigned long benchmark_single_copy(long c,int dummy, uint8_t **dest, uint8_t **source, int *planes) +static double benchmark_single_copy(long c,int dummy, uint8_t **dest, uint8_t **source, int *planes) { - uint64_t k; int j; - uint64_t stats[c]; + int k; int j; + double stats[c]; uint64_t bytes = ( planes[0] + planes[1] + planes[2] + planes[3] ); for( k = 0; k < c; k ++ ) { - uint64_t t = rdtsc(0); + double t = get_time(); for( j = 0; j < 4; j ++ ) { veejay_memcpy( dest[j], source[j], planes[j] ); } - t = rdtsc(0) - t; + t = get_time() - t; stats[k] = t; } - uint64_t sum = 0; + double sum = 0.0; for( k = 0; k < c; k ++ ) sum += stats[k]; - uint64_t best_time = (sum/c); + double best_time = (sum/c); - veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %2.2f ms", - (float)(bytes /1048576.0f), (best_time/1000.0f)); + veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %g",(float)((bytes*c) /1048576.0f), best_time); return best_time; } -typedef unsigned long (*benchmark_func)(long c, int dummy, uint8_t **dest, uint8_t **source, int *planes); +typedef double (*benchmark_func)(long c, int dummy, uint8_t **dest, uint8_t **source, int *planes); void run_benchmark_test(int n_tasks, benchmark_func f, char *str, int n_frames, uint8_t **dest, uint8_t **source, int *planes ) { - uint32_t N = 8; - uint64_t stats[N]; + int N = 8; + double stats[N]; uint32_t i; - uint64_t fastest = 0; + double fastest = 0.0; float work_size = (planes[0] + planes[1] + planes[2] + planes[3]) / 1048576.0f; veejay_msg(VEEJAY_MSG_INFO, "run test '%s' (%dx) on chunks of %2.2f MB:", str, N, work_size ); @@ -1848,8 +1847,8 @@ void run_benchmark_test(int n_tasks, benchmark_func f, char *str, int n_frames, fastest = stats[i]; } - uint64_t sum = 0; - uint64_t slowest=fastest; + double sum = 0.0; + double slowest=fastest; for( i = 0; i < N; i ++ ) { if( stats[i] < fastest ) { @@ -1860,8 +1859,7 @@ void run_benchmark_test(int n_tasks, benchmark_func f, char *str, int n_frames, float average = (sum / N); - veejay_msg(VEEJAY_MSG_INFO, "run done: best score for %s is %2.4f ms, worst is %2.4f ms, average is %2.4f ms", - str, fastest/1000.0f, slowest/1000.0f, average/1000.0f ); + veejay_msg(VEEJAY_MSG_INFO, "run done: best score for %s is %g, worst is %g, average is %g",str, fastest, slowest, average ); } void benchmark_tasks(int n_tasks, long n_frames, int w, int h) @@ -1914,16 +1912,19 @@ void benchmark_veejay(int w, int h) if( h < 64) h = 64; + veejay_msg(VEEJAY_MSG_INFO, "Starting benchmark %dx%d YUVP 4:2:2 (100 frames)", w,h); + int n_tasks = task_num_cpus(); - init_parallel_tasks( n_tasks ); char *str2 = getenv( "VEEJAY_MULTITHREAD_TASKS" ); if( str2 ) { n_tasks = atoi(str2); } - - int n_frames = 100; - veejay_msg(VEEJAY_MSG_INFO, "Benchmark %dx%d YUVP 4:2:2 (%d frames)", w,h,n_frames); - benchmark_tasks( n_tasks, n_frames,w,h ); + + veejay_msg(VEEJAY_MSG_INFO, "VEEJAY_MULTITHREAD_TASKS=%d", n_tasks ); + + 
init_parallel_tasks( n_tasks ); + + benchmark_tasks( n_tasks,100,w,h ); } void *vj_hmalloc(size_t sze, const char *name) diff --git a/veejay-current/veejay-server/libvjmem/vj-x86.c b/veejay-current/veejay-server/libvjmem/vj-x86.c index d6b9970c..2bf92ed6 100644 --- a/veejay-current/veejay-server/libvjmem/vj-x86.c +++ b/veejay-current/veejay-server/libvjmem/vj-x86.c @@ -183,10 +183,6 @@ int vj_mem_threaded_init(int w, int h) num_tasks = n_cpus; if( num_tasks < 1 ) num_tasks = 1; - - if( num_tasks > 1 ) - veejay_msg( VEEJAY_MSG_INFO, "Using %d threads scheduled over %d cpus in performer.", num_tasks, n_cpus-1 ); - } } diff --git a/veejay-current/veejay-server/thirdparty/Makefile.am b/veejay-current/veejay-server/thirdparty/Makefile.am index 952e0cdb..71fb877a 100644 --- a/veejay-current/veejay-server/thirdparty/Makefile.am +++ b/veejay-current/veejay-server/thirdparty/Makefile.am @@ -3,3 +3,8 @@ SUBDIRS = aclib bio2jack libhash liblzo libOSC libresample if !HAVE_MJPEGTOOLS SUBDIRS += mjpegtools endif + +if HAVE_ARM +SUBDIRS += fastarm +endif + diff --git a/veejay-current/veejay-server/thirdparty/fastarm/Makefile.am b/veejay-current/veejay-server/thirdparty/fastarm/Makefile.am new file mode 100644 index 00000000..1381cee1 --- /dev/null +++ b/veejay-current/veejay-server/thirdparty/fastarm/Makefile.am @@ -0,0 +1,10 @@ +# Makefile for veejay +MAINTAINERCLEANFILES = Makefile.in +AM_CFLAGS = $(FASTARM_CFLAGS) +AM_CPPFLAGS = -I$(top_srcdir) -I$(includedir) \ + -I$(top_srcdir)/thirdparty $(FASTARM_CFLAGS) + +FASTARM_LIB_FILE = libfastarm.la +noinst_LTLIBRARIES = $(FASTARM_LIB_FILE) +libfastarm_la_SOURCES = new_arm.S +EXTRA_DIST= diff --git a/veejay-current/veejay-server/thirdparty/fastarm/README b/veejay-current/veejay-server/thirdparty/fastarm/README new file mode 100644 index 00000000..d48eba41 --- /dev/null +++ b/veejay-current/veejay-server/thirdparty/fastarm/README @@ -0,0 +1,97 @@ +fastarm + +This toolkit contains a set of fast memcpy/memset variants for ARM +platforms. They either use the standard register file, or optionally +NEON instructions, + +Several basic families of variants are provided; the current ones are +the "new memcpy" variants which are the default for memcpy replacement, +which generally do not overfetch beyond the source region and can be +configured to use unaligned memory access for small sizes, or to use +strictly aligned memory access. This family can also be configured to +include a fast path for smaller sizes (this is the default), disabling +this results in smaller code size at the expense of worse performance +for small sizes. NEON optimized versions, which are generally faster +with reduced code size, are also provided. + +To compile the benchmark program, run 'make'. This will compile in a +plethora of variants with different preload strategies, block sizes, +alignment etc. + +A benchmark program to compare various memcpy variants is provided. Try +something like "./benchmark --memcpy ad --all". (Use --memcpy al on the +Raspberry Pi platform). + +To compile a memcpy replacement library, set PLATFORM to one of the +values described at the beginning of the Makefile. This selects the +cache line size to use and whether to use NEON versions. + +Optionally disable Thumb2 mode compilation by commenting out the THUMBFLAGS +definition. It must be disabled on the Raspberry Pi. + +Then run: + + sudo make install_memcpy_replacement + +The replacement memcpy/memset shared library will be installed into +/usr/lib/arm-linux-gnueabihf/ as libfastarm.so. 
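+
+Before enabling the library system-wide, it can be tried out for a single
+process with LD_PRELOAD. This is only a quick sketch, assuming the install
+path above and a dynamically linked test program; the benchmark invocation
+is the same one suggested earlier in this README:
+
+    LD_PRELOAD=/usr/lib/arm-linux-gnueabihf/libfastarm.so ./benchmark --memcpy ad --all
+
+If the preloaded library is picked up, the benchmark results should match
+the variant selected via PLATFORM in the Makefile.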
+ +To enable the use of the replacement memcpy in applications, create or edit +the file /etc/ld.so.preload so that it contains the line: + + /usr/lib/arm-linux-gnueabihf/libfastarm.so + +On the RPi platform, references to libcofi_rpi.so should be commented out +or deleted. The new memcpy should now be activated for newly launched +programs. To be sure, reboot or run: + + sudo ldconfig + +To revert to the default optimized memcpy on the RPi platform, +edit /etc/ld.so.preload so that it contains the line: + + /usr/lib/arm-linux-gnueabihf/libcofi_rpi.so + +instead of the one using libfastarm.so. + +Note on cache line size: + +Although assuming a preload line size 64 bytes is a little faster on several +Cortex platforms for small to moderate sizes, when accessing DRAM +with larger sizes assuming 32 byte preloads seems to be faster. On earlier +Cortex A9 models, 32 byte preloads are required for good performance in all +cases. + +Notes on performance with and without NEON: + +For NEON-based memcpy, a significant benefit is seen on the tested Cortex A8 +platform for unaligned copies in cache memory and for aligned and unaligned +copies in DRAM. Performance for aligned copies in cache memory is relatively +similar to the optimized non-NEON function. + +Results in MB/s on a Cortex A8, with Thumb2 mode enabled, of +standard libc (Debian unstable), armv7 and NEON optimized memcpy +variants with line size of 32 bytes: + + libc armv7 NEON +test 0 522 549 567 +test 1 329 377 378 +test 2 434 430 513 +test 28 351 361 458 +test 29 246 248 358 +test 43 467 512 581 + +Test 0 in the benchmark program tests word-aligned requests with +sizes that are a power of 2 up to 4096 bytes distributed according +to a power law. +Test 1 in the benchmark program tests word-aligned requests with +sizes up to 1024 that are a multiple of 4, distributed according +to a power law. +Test 2 in the benchmark program tests unaligned requests with sizes +up to 1023 bytes. +Test 28 in the benchmark program tests word aligned requests in DRAM +with sizes up to 1024 bytes. +Test 29 in the benchmark program tests word aligned requests in DRAM +with sizes up to 256 bytes. +Test 43 in the benchmark program tests page aligned requests in DRAM +of size 4096 (copying a memory page). diff --git a/veejay-current/veejay-server/thirdparty/fastarm/new_arm.S b/veejay-current/veejay-server/thirdparty/fastarm/new_arm.S new file mode 100644 index 00000000..fd3a3a14 --- /dev/null +++ b/veejay-current/veejay-server/thirdparty/fastarm/new_arm.S @@ -0,0 +1,1858 @@ +/* + * Copyright 2013 Harm Hanemaaijer + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + */ + +#ifdef CONFIG_THUMB +#define W(instr) instr.w +#define THUMB(instr...) instr +#define ARM(instr...) +#else +#define W(instr) instr +#define THUMB(instr...) +#define ARM(instr...) instr +#endif + +/* + * In practice, because the way NEON is configured on most systems, + * specifying alignment hints for NEON instructions doesn't seem + * to improve performance, or even degrade performance in some cases. + * However, actually having the address aligned to an element + * boundary or greater is beneficial. + */ +#define NEON_ALIGN(n) +/* #define NEON_ALIGN(n) :n */ + +/* Prevent the stack from becoming executable */ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + +.text +.syntax unified +.arch armv7a +.fpu neon + +.macro asm_function function_name + .global \function_name +.func \function_name +.type \function_name, function +ARM( .p2align 5 ) +THUMB( .p2align 2 ) +\function_name: +.endm + +/* + * The following memcpy implementation is optimized with a fast path + * for common, word aligned cases and optionally use unaligned access for + * small sizes. + * + * - line_size is the cache line size used for prefetches. Must be 64 or 32. + * - prefetch_distance is the number of cache lines to look ahead and must be + * >= 2. + * - write_align is the write alignment enforced before the main loop for larger + * sizes (word aligned case) and must be 0, 16, 32, or 64. + * - aligned_access must be 0 or 1. When enabled, no unaligned memory accesses + * will occur. Both small size tresholds for unaligned access are not used + * in this case. + */ + +/* The threshold size for using the fast path for the word-aligned case. */ +#define FAST_PATH_THRESHOLD 256 +/* The threshold size for using the small size path for the word-aligned case. */ +#define SMALL_SIZE_THRESHOLD 15 +/* + * The threshold size for using the small size path for the unaligned case. + * Unaligned memory accesses will be generated for requests smaller or equal to + * this size. + */ +#define UNALIGNED_SMALL_SIZE_THRESHOLD 64 +/* + * The threshold size for using the small size path when both the source and + * the destination are unaligned. Unaligned memory accesses will be generated + * for requests smaller of equal to this size. + */ +#define BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD 32 + +/* + * For a code-reduced version, define all four of the above constants to 0, + * eliminating the fast path and small size special cases. With Thumb2 + * enabled, this resulted in a reduction in code size from 1150 to 824 bytes, + * at the cost of lower performance for smaller sizes. + */ +// #define FAST_PATH_THRESHOLD 0 +// #define SMALL_SIZE_THRESHOLD 0 +// #define UNALIGNED_SMALL_SIZE_THRESHOLD 0 +// #define BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD 0 + +/* + * EARLY_PREFETCHES is used in the fast path implementation. + * The optimal value for EARLY_PREFETCHES was determined empirically. + * It is equal to prefetch_distance + 1 for line_size 32. + * and prefetch_distance - 1 for line_size 64. + */ +#define EARLY_PREFETCHES (\prefetch_distance - (\line_size / 32) * 2 + 3) + +#if FAST_PATH_THRESHOLD > 0 +#define FAST_PATH(instr...) instr +#define NO_FAST_PATH(instr...) +#else +#define FAST_PATH(instr...) +#define NO_FAST_PATH(instr...) 
instr +#endif + + +/* Helper macro for the fast-path implementation. */ + +.macro copy_16_bytes bytes_to_go, line_size, prefetch_distance +#ifdef CONFIG_THUMB + /* + * When Thumb2 mode is enabled, the ldmia/stmia instructions + * will be 16-bit, and the preload instruction will be + * 32-bit, so we only need one 32-bit wide nop instruction + * when there's no preload, for a total size of two words. + */ +.if \bytes_to_go >= (EARLY_PREFETCHES * \line_size) && \ + (\bytes_to_go % \line_size) == 0 + pld [r1, ip] + ldmia r1!, {r3, r4, r5, r6} + stmia r0!, {r3, r4, r5, r6} +.else + ldmia r1!, {r3, r4, r5, r6} + W( nop ) + stmia r0!, {r3, r4, r5, r6} +.endif +#else + /* + * When ARM mode is enabled, every instruction is one word, + * so make sure the entire block is four instructions. + */ +.if \bytes_to_go >= (EARLY_PREFETCHES * \line_size) && \ +(\bytes_to_go % \line_size) == 0 + pld [r1, ip] +.else + nop +.endif + ldmia r1!, {r3, r4, r5, r6} + nop + stmia r0!, {r3, r4, r5, r6} +#endif +.endm + + +/* Helper macro implementing unaligned copy. */ + +.macro unaligned_copy shift, line_size, prefetch_distance, write_align, \ +aligned_access + /* + * ip is the aligned source base address. + * r3 is a word of data from the source. + */ +.if \write_align > 0 + cmp r2, #(32 + \write_align - 4) +.else + cmp r2, #32 +.endif + push {r5} + blt 55f + subs r2, r2, #32 + + /* Handle write alignment. */ +.if \write_align > 0 +.if \write_align == 8 + tst r0, #4 + mov r4, r3, lsr #\shift + ldrne r3, [r1], #4 + subne r2, r2, #4 + orrne r4, r4, r3, lsl #(32 - \shift) + strne r4, [r0], #4 +.else + ands r5, r0, #(\write_align - 1) + rsb r5, r5, #\write_align + beq 59f + sub r2, r2, r5 + +58: movs r4, r3, lsr #\shift + ldr r3, [r1], #4 + subs r5, r5, #4 + orr r4, r4, r3, lsl #(32 - \shift) + str r4, [r0], #4 + bgt 58b +59: +.endif +.endif + + /* + * Assume a preload at aligned base + line_size will + * be useful. + */ + pld [ip, #\line_size] + push {r6-r11} + mov r11, r3 + + mov r4, ip + add r5, r1, #(\prefetch_distance * \line_size) + subs r2, r2, #(\prefetch_distance * \line_size) + bic r3, r5, #31 + add r4, r4, #(2 * \line_size) + blt 54f + cmp r4, r3 + sub ip, r3, r1 + /* + * "Catch-up" the early preloads (which have been performed up + * to aligned source address + line_size) to the preload offset + * used in the main loop. + */ + bge 52f +51: adds r4, r4, #\line_size /* Thumb16 */ + cmp r4, r3 + pld [r4, #(- \line_size)] + blt 51b +52: + /* + * Note that when L1_CACHE_BYTES is 64, we are + * prefetching every 32 bytes. Although not optimal + * there doesn't seem to be big penalty for the extra + * preload instructions and it prevents greater + * code size and complexity. + */ +53: pld [r1, ip] +54: + ldmia r1!, {r4-r7} + mov r3, r11, lsr #\shift + ldmia r1!, {r8-r11} + orr r3, r3, r4, lsl #(32 - \shift) + movs r4, r4, lsr #\shift /* Thumb16 */ + orr r4, r4, r5, lsl #(32 - \shift) + movs r5, r5, lsr #\shift /* Thumb16 */ + orr r5, r5, r6, lsl #(32 - \shift) + movs r6, r6, lsr #\shift /* Thumb16 */ + orr r6, r6, r7, lsl #(32 - \shift) + movs r7, r7, lsr #\shift /* Thumb16 */ + orr r7, r7, r8, lsl #(32 - \shift) + mov r8, r8, lsr #\shift + orr r8, r8, r9, lsl #(32 - \shift) + mov r9, r9, lsr #\shift + orr r9, r9, r10, lsl #(32 - \shift) + mov r10, r10, lsr #\shift + orr r10, r10, r11, lsl #(32 - \shift) + subs r2, r2, #32 + stmia r0!, {r3-r10} + bge 53b + cmn r2, #(\prefetch_distance * \line_size) + bge 54b + /* Correct the count. 
*/ + adds r2, r2, #(\prefetch_distance * \line_size + 32) + + mov r3, r11 + pop {r6-r11} + +55: bics r5, r2, #3 + beq 57f + +56: movs r4, r3, lsr #\shift + ldr r3, [r1], #4 + subs r5, r5, #4 + orr r4, r4, r3, lsl #(32 - \shift) + str r4, [r0], #4 + bgt 56b + +57: pop {r5} + pop {r4} + subs r1, r1, #((32 - \shift) / 8) +.if \aligned_access == 1 + b 7b +.else + b 3b +.endif +.endm + + +/* The main memcpy function macro. */ + +.macro memcpy_variant line_size, prefetch_distance, write_align, \ +aligned_access + +.if \aligned_access == 1 + cmp r2, #3 +.else +NO_FAST_PATH( cmp r2, #3 ) +.endif + orr r3, r0, r1 +.if \aligned_access == 1 + push {r0} + ble 7f +.else +NO_FAST_PATH( push {r0} ) +NO_FAST_PATH( ble 3f ) +.endif + bic ip, r1, #(\line_size - 1) + tst r3, #3 + pld [ip] +.if \aligned_access == 1 +FAST_PATH( bne 30f ) +.else +FAST_PATH( push {r0} ) +FAST_PATH( bne 7f ) /* Unaligned source or destination. */ +.endif +FAST_PATH( cmp r2, #FAST_PATH_THRESHOLD ) +FAST_PATH( bgt 10f ) +NO_FAST_PATH( bne 30f ) +#if FAST_PATH_THRESHOLD == 0 + /* + * When the fast path is disabled, check whether there are + * enough bytes for alignment, and jump to the main handling + * code for larger sizes. + */ +.if \write_align > 0 + cmp r2, #(\write_align - 4) + bge 10f +.endif + push {r4} + b 18f +#endif + + /* + * Fast path for aligned copies of size <= FAST_PATH_THRESHOLD. + */ +#if FAST_PATH_THRESHOLD > 0 +#if SMALL_SIZE_THRESHOLD == 15 + bics r3, r2, #15 + pld [ip, #\line_size] + /* Jump for small sizes <= 15 bytes. */ + beq 5f +#else + cmp r2, #SMALL_SIZE_THRESHOLD + pld [ip, #\line_size] + /* Jump for small sizes <= SMALL_SIZE_THRESHOLD bytes. */ + ble 5f + bic r3, r2, #15 +#endif + +9: /* + * This is the entry-point into the fast path from + * an unaligned request that has been aligned. + */ + push {r4, r5, r6} + + /* + * Use a heuristic to determine whether the preload + * at aligned_base + 2 * line_size will be useful. + */ +.if EARLY_PREFETCHES >= 3 + cmp r2, #(2 * \line_size - \line_size / 2) +.endif + add r5, ip, #(EARLY_PREFETCHES * \line_size) +.if EARLY_PREFETCHES >= 3 + blt 1f +.endif +.if EARLY_PREFETCHES == 3 + pld [ip, #(2 * \line_size)] ) +.endif +.if EARLY_PREFETCHES == 4 + cmp r2, #(3 * \line_size - \line_size / 2) + pld [ip, #(2 * \line_size)] + blt 1f + pld [ip, #(3 * \line_size)] +.endif +.if EARLY_PREFETCHES == 5 + cmp r2, #(3 * \line_size - \line_size / 2) + pld [ip, #(2 * \line_size)] + blt 1f + cmp r2, #(4 * \line_size - \line_size / 2) + pld [ip, #(3 * \line_size)] + blt 1f + pld [ip, #(4 * \line_size)] +.endif + +1: /* + * Set r5 so that the next preload will occur + * exactly at aligned_base + EARLY_PREFETCHES * + * line_size. For example, if line_size is 64 + * and the number of bytes is 240, the next preload + * will occur after processing 48 bytes, which is derived + * from the formula r3 & (line_size - 1), + * where r3 is equal to number_of_bytes & (~15). + */ + rsb r4, r3, #256 + subs r5, r5, r1 + and ip, r3, #(\line_size - 1) + subs r2, r2, r3 /* Thumb16 */ +THUMB( lsrs r4, r4, #1 ) /* Thumb16 */ + sub ip, r5, ip + add pc, pc, r4 + nop + /* >= 256 bytes to go. */ + copy_16_bytes 256, \line_size, \prefetch_distance + /* >= 240 bytes go. */ + copy_16_bytes 240, \line_size, \prefetch_distance + /* >= 224 bytes to go. */ + copy_16_bytes 224, \line_size, \prefetch_distance + /* >= 204 bytes go. */ + copy_16_bytes 204, \line_size, \prefetch_distance + /* >= 192 bytes to go. */ + copy_16_bytes 192, \line_size, \prefetch_distance + /* >= 176 bytes go. 
*/ + copy_16_bytes 176, \line_size, \prefetch_distance + /* >= 160 bytes to go. */ + copy_16_bytes 160, \line_size, \prefetch_distance + /* >= 144 bytes go. */ + copy_16_bytes 144, \line_size, \prefetch_distance + /* >= 128 bytes to go. */ + copy_16_bytes 128, \line_size, \prefetch_distance + /* >= 112 bytes go. */ + copy_16_bytes 112, \line_size, \prefetch_distance + /* >= 96 bytes to go. */ + copy_16_bytes 96, \line_size, \prefetch_distance + /* >= 80 bytes to go. */ + copy_16_bytes 80, \line_size, \prefetch_distance + /* >= 64 bytes to go. */ + copy_16_bytes 64, \line_size, \prefetch_distance + /* >= 48 bytes to go. */ + copy_16_bytes 48, \line_size, \prefetch_distance + /* >= 32 bytes to go. */ + copy_16_bytes 32, \line_size, \prefetch_distance + /* At this point there are 16 to 31 bytes to go. */ + tst r2, #15 + ldmia r1!, {r3, r4, r5, r6} + cmpne r2, #8 + /* + * If r2 == 8, we need to clear the eq flag while + * making sure carry remains set. + */ + tsteq r2, #15 + stmia r0!, {r3, r4, r5, r6} + /* + * The equal flag is set if there are no bytes left. + * The carry flag is set is there are >= 8 bytes left. + */ + pop {r4, r5, r6} + beq 4f + +2: + /* + * ARM mode imposes restrictions on the registers used + * in double-word loads and stored so we have to use + * single-word operations. + */ +.if \aligned_access == 0 + ARM( ldrcs r3, [r1], #4 ) + ARM( ldrcs ip, [r1], #4 ) + ARM( strcs r3, [r0], #4 ) + ARM( strcs ip, [r0], #4 ) + THUMB( ldrdcs r3, ip, [r1], #8 ) + THUMB( strdcs r3, ip, [r0], #8 ) +.else + ldrcs r3, [r1], #4 + ldrcs ip, [r1], #4 + strcs r3, [r0], #4 + strcs ip, [r0], #4 +.endif + tst r2, #4 + ldrne ip, [r1], #4 + strne ip, [r0], #4 + tst r2, #3 + popeq {r0} + bxeq lr + + /* + * Handle the last up to three bytes. Unaligned access + * make take place if source or destination is not + * half-word aligned. + */ +3: movs r2, r2, lsl #31 + ldrhcs r3, [r1], #2 + strhcs r3, [r0], #2 + ldrbne r3, [r1], #1 + strbne r3, [r0], #1 +4: pop {r0} + bx lr + +5: /* + * Sizes <= SMALL_SIZE_THRESHOLD bytes, both source and + * destination aligned. + */ +#if SMALL_SIZE_THRESHOLD <= 15 + cmp r2, #8 /* cs if r2 >= 8. */ + b 2b +#else +101: tst r2, #4 + ldrne r3, [r1], #4 + subne r2, r2, #4 + strne r3, [r0], #4 + cmp r2, #8 + blt 3b +6: cmp r2, #16 + ldr r3, [r1], #4 + ldr ip, [r1], #4 + str r3, [r0], #4 + sub r2, r2, #8 + str ip, [r0], #4 + bge 6b + cmp r2, #0 + popeq {r0} + bxeq lr + b 3b +#endif + +#endif /* FAST_PATH_THRESHOLD > 0 */ + +.if \aligned_access == 1 + /* + * Handle the last up to three bytes avoiding + * unaligned memory access. + */ +7: movs r2, r2, lsl #31 + ldrbcs r3, [r1], #1 + ldrbcs ip, [r1], #1 + strbcs r3, [r0], #1 + strbcs ip, [r0], #1 + ldrbne r3, [r1], #1 + strbne r3, [r0], #1 + pop {r0} + bx lr +.endif + +#if FAST_PATH_THRESHOLD > 0 +.if \aligned_access == 0 +7: /* + * Unaligned source or destination. There are seperate small + * size thresholds for when both source and destination are + * unaligned and the other case. + */ + tst r0, #3 + mov r3, #UNALIGNED_SMALL_SIZE_THRESHOLD + tstne r1, #3 + movne r3, #BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD + cmp r2, r3 + bgt 30f + + /* Small sizes, unaligned case. Use single word load/stores. */ +#if SMALL_SIZE_THRESHOLD >= 16 + /* Use the identical code path already defined above. 
*/ + b 101b +#else + tst r2, #4 + ldrne r3, [r1], #4 + subne r2, r2, #4 + strne r3, [r0], #4 + cmp r2, #8 + blt 3b +8: cmp r2, #16 + ldr r3, [r1], #4 + ldr ip, [r1], #4 + str r3, [r0], #4 + sub r2, r2, #8 + str ip, [r0], #4 + bge 8b + b 3b +#endif +.endif +#endif /* FAST_PATH_THRESHOLD > 0 */ + +10: /* + * This is the start of the handling of larger sizes for + * aligned copies. + * + * Size > FAST_PATH_THRESHOLD (256). + * ip is the line_sized aligned source address for preloads. + */ + +.if \write_align >= 16 + ands r3, r0, #(\write_align - 1) + push {r4} + rsb r3, r3, #\write_align + beq 17f + push {lr} + bl 20f + pop {lr} +17: +.elseif \write_align == 8 + /* + * For write alignment of 8, it is quickest to do a simple + * conditional load/store. + */ + tst r0, #4 + push {r4} + ldrne r3, [r1], #4 + subne r2, r2, #4 + strne r3, [r0], #4 +.else + push {r4} +.endif + +18: +.if (FAST_PATH_THRESHOLD - (\write_align - 4)) < \line_size + cmp r2, #\line_size + blt 15f +.endif + subs r2, r2, #\line_size + +16: /* + * This is the entry-point when source and destination were + * initially unaligned but are now aligned because they had + * the same alignment within a word. Write alignment and + * size check has already been handled. + */ + + push {r5-r11} + + /* + * Assume a preload at aligned base + line_size will + * be useful. + */ + mov r4, ip + pld [ip, #\line_size] + add r5, r1, #(\prefetch_distance * \line_size) + subs r2, r2, #(\prefetch_distance * \line_size) + bic r3, r5, #(\line_size - 1) + add r4, r4, #(2 * \line_size) + blt 14f + cmp r4, r3 + sub ip, r3, r1 + /* + * "Catch-up" the early preloads (which have been performed up + * to aligned source address + line_size) to the preload offset + * used in the main loop. + */ + bge 12f +11: adds r4, r4, #\line_size /* Thumb16 */ + cmp r4, r3 + pld [r4, #(- \line_size)] + blt 11b +12: + + /* + * The main loop for large sizes. Copy 32 bytes at a time + * using ldmia/stmia while prefetching a 32-byte aligned + * address for line size 32, or 64 bytes at a time while + * prefetching a 64-byte aligned address for line size 64. + */ +13: pld [r1, ip] +14: +.if \line_size == 32 + ldmia r1!, {r4-r7} + subs r2, r2, #32 + ldmia r1!, {r8-r11} + stmia r0!, {r4-r7} + stmia r0!, {r8-r11} +.else + ldmia r1!, {r4-r11} + subs r2, r2, #64 + stmia r0!, {r4-r11} + ldmia r1!, {r4-r11} + stmia r0!, {r4-r11} +.endif + bge 13b + cmn r2, #(\prefetch_distance * \line_size) + bge 14b + /* Correct the count. */ + adds r2, r2, #((\prefetch_distance + 1) * \line_size) + pop {r5-r11} + +15: ands r3, r2, #60 +.if \write_align <= 8 + /* + * When the subroutine is not used for write alignment, the + * subroutine will only be called once, so branch without + * linking. + */ + bne 20f +19: +.else + mov ip, lr + blne 20f + mov lr, ip +.endif + pop {r4} +#if FAST_PATH_THRESHOLD > 0 + cmp r2, #0 + bne 3b +#else + ARM( cmp r2, #0 ) + ARM( beq 4f ) + THUMB( cbz r2, 4f ) + /* Handle the last up to three bytes. */ +3: movs r2, r2, lsl #31 + ldrhcs r3, [r1], #2 + strhcs r3, [r0], #2 + ldrbne r3, [r1], #1 + strbne r3, [r0], #1 +4: +#endif + pop {r0} + bx lr + + /* + * Subroutine that copies a multiple of 4 bytes of size + * r3 from 0 to 64 or 32 bytes. r2 is decremented by the + * number of bytes copied. + */ +20: tst r3, #4 + sub r2, r2, r3 + ldrne r4, [r1], #4 + subne r3, r3, #4 + strne r4, [r0], #4 +.if \write_align <= 32 && \line_size == 32 + rsb r3, r3, #32 +.else + rsb r3, r3, #64 +.endif + /* + * These ldmia/stmia instructions are 16-bit on Thumb2, + * 32-bit on ARM. 
+ */ + THUMB( lsrs r3, r3, #1 ) + add pc, pc, r3 + nop + ldmia r1!, {r3, r4} + stmia r0!, {r3, r4} + ldmia r1!, {r3, r4} + stmia r0!, {r3, r4} + ldmia r1!, {r3, r4} + stmia r0!, {r3, r4} + ldmia r1!, {r3, r4} + stmia r0!, {r3, r4} +.if \write_align > 32 || \line_size > 32 + ldmia r1!, {r3, r4} + stmia r0!, {r3, r4} + ldmia r1!, {r3, r4} + stmia r0!, {r3, r4} + ldmia r1!, {r3, r4} + stmia r0!, {r3, r4} + ldmia r1!, {r3, r4} + stmia r0!, {r3, r4} +.endif +.if \write_align <= 8 + b 19b +.else + mov pc, lr +.endif + +30: /* + * Unaligned case. Align the destination. + * Number of bytes is > UNALIGNED_SMALL_SIZE_THRESHOLD. + * Note: This may use unaligned access. + * ip is the line_size aligned source address for preloads. + */ + ands r3, r0, #3 + push {r4} + andeq r3, r1, #3 + beq 40f /* Destination is aligned but source is not. */ + /* Align the destination. */ + cmp r3, #2 +.if \aligned_access == 1 + ldrble r4, [r1], #1 + ldrble r3, [r1], #1 + suble r2, r2, #2 + strble r4, [r0], #1 + strble r3, [r0], #1 +.else + ldrhle r4, [r1], #2 + suble r2, r2, #2 + strhle r4, [r0], #2 +.endif + ldrbne r4, [r1], #1 + subne r2, r2, #1 + strbne r4, [r0], #1 + ands r3, r1, #3 + bne 40f /* Destination is aligned but source is not. */ + +#if 0 && FAST_PATH_THRESHOLD > 0 + /* + * Source and destination are now aligned. + * Now recreate the situation of a word-aligned memcpy + * with the current source and destination, + * which may require an extra preload instruction. + * + * This path is currently disabled disabled in favour + * of the one below this which does write alignment and + * jumps into the main loop for larger sizes. + */ + bic r3, r1, #(\line_size - 1) + pop {r4} + cmp r3, ip + THUMB( pldne [r3] ) + THUMB( cmp r2, #FAST_PATH_THRESHOLD ) + THUMB( mov ip, r3 ) + ARM( beq 31f ) + ARM( pld [r3] ) + ARM( mov ip, r3 ) +31: ARM( cmp r2, #FAST_PATH_THRESHOLD ) + bgt 10b + + /* + * Recreate the fast path small size check here, + * but only if it necessary. + */ +.if (BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD + 1 - 3) <= SMALL_SIZE_THRESHOLD || +\aligned_access == 1 + cmp r2, #SMALL_SIZE_THRESHOLD + pld [ip, #\line_size] + /* Jump for small sizes <= SMALL_SIZE_THRESHOLD bytes. */ + ble 5b +.else + pld [ip, #\line_size] +.endif + bic r3, r2, #15 + b 9b + +#else + /* + * Source and destination are now aligned. Check carefully + * whether there are enough bytes to do alignment. + */ +.if \write_align > 0 +.if (BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD + 1 - 3) < (\write_align - 4) \ +|| \aligned_access == 1 + cmp r2, #(\write_align - 4) + blt 31f +.endif +.if \write_align == 8 + /* + * For write alignment of 8, it is quickest to do a simple + * conditional load/store. + */ + tst r0, #4 + ldrne r3, [r1], #4 + subne r2, r2, #4 + strne r3, [r0], #4 +.else + ands r3, r0, #(\write_align - 1) + rsb r3, r3, #\write_align + beq 31f + push {lr} + bl 20b + pop {lr} +.endif + +31: /* + * Check whether there are enough bytes to do one iteration + * of the main loop. + */ +.if (BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD + 1 - 3 - (\write_align - 4)) < \line_size \ +|| \aligned_access == 1 + cmp r2, #\line_size + blt 15b +.endif + subs r2, r2, #\line_size +.else + /* + * No write alignment. Only have to check for enough bytes to + * do one iteration of the main loop. + */ + +.if (BOTH_UNALIGNED_SMALL_SIZE_THRESHOLD + 1 - 3) < \line_size \ +|| \aligned_access == 1 + cmp r2, #\line_size + blt 15b +.endif + subs r2, r2, #\line_size +.endif + b 16b +#endif + +40: /* + * Unaligned case. Size is > SMALL_SIZE_THRESHOLD - 3. 
+ */ + bic r1, r1, #3 + cmp r3, #2 + ldr r3, [r1], #4 + beq 41f + bgt 42f + + unaligned_copy 8, \line_size, \prefetch_distance, \ + \write_align, \aligned_access + +41: unaligned_copy 16, \line_size, \prefetch_distance, \ + \write_align, \aligned_access + +42: unaligned_copy 24, \line_size, \prefetch_distance, \ + \write_align, \aligned_access + +.endm + +/* + * The following is a NEON-based memcpy implementation that may use unaligned + * access, but NEON instruction addresses are always at least element aligned. + * It is optimized for both Thumb2 (CONFIG_THUMB) and ARM mode. + * + * - line_size is the cache line size used for prefetches. Must be 64 or 32. + * - prefetch_distance is the number of cache lines to look ahead and must be + * >= 2, or 0 to disable prefetching in the main copying loop. + * - early_prefetch indicates whether to perform early preloads. Must be 0 or 1. + * When prefetch_distance > 0, early_prefetch should be 1. To remove all PLD + * instructions altogether, set both prefetch_distance and early_prefetch + * to 0. + */ + +.macro neon_memcpy_variant line_size, prefetch_distance, early_prefetch + + cmp r2, #3 +.if \prefetch_distance > 0 || \early_prefetch == 1 + push {r0} +.else + mov ip, r0 +.endif + orr r3, r0, r1 + ble 8f +.if \prefetch_distance > 0 || \early_prefetch == 1 + bic ip, r1, #(\line_size - 1) +.endif + tst r3, #3 +.if \early_prefetch == 1 + pld [ip] +.endif + bne 10f /* Unaligned source or destination. */ + push {r4} + + /* Aligned source and destination. */ +1: cmp r2, #256 + /* + * Jump to word-aligned NEON fast path <= 256 bytes. + */ + ble 18f + subs r2, r2, #\line_size + + /* Align to a 32-byte boundary. */ +#ifdef CONFIG_THUMB + /* + * Use conditional NEON instructions when + * available (Thumb2 mode) + */ + ands r4, r0, #31 + rsb r4, r4, #32 + beq 31f + tst r4, #4 + sub r2, r2, r4 + ldrne r3, [r1 :32], #4 + strne r3, [r0 :32], #4 + tst r4, #8 + vld1ne.32 {d0}, [r1]! + vst1ne.64 {d0}, [r0 NEON_ALIGN(64)]! + cmp r4, #16 + vld1ge.32 {d2, d3}, [r1]! + vst1ge.64 {d2, d3}, [r0 NEON_ALIGN(128)]! +#else + /* + * Otherwise, branch into a series of single + * loads/stores. + */ + ands r4, r0, #31 + beq 31f + rsb r3, r4, #32 + lsl r4, r4, #1 + sub r2, r2, r3 + add pc, pc, r4 + nop + ldr r3, [r1], #4 + str r3, [r0], #4 + ldr r4, [r1], #4 + str r4, [r0], #4 + ldr r3, [r1], #4 + str r3, [r0], #4 + ldr r4, [r1], #4 + str r4, [r0], #4 + ldr r3, [r1], #4 + str r3, [r0], #4 + ldr r4, [r1], #4 + str r4, [r0], #4 + ldr r3, [r1], #4 + str r3, [r0], #4 + ldr r4, [r1], #4 + str r4, [r0], #4 +#endif + cmp r2, #0 + addlt r2, r2, \line_size + blt 6f + +31: +.if \early_prefetch == 1 + pld [ip, #\line_size] +.endif +.if \prefetch_distance > 0 + /* + * Assume a preload at aligned base + line_size will + * be useful. + */ + push {r5} + mov r4, ip + add r5, r1, #(\prefetch_distance * \line_size) + subs r2, r2, #(\prefetch_distance * \line_size) + bic r3, r5, #(\line_size - 1) + add r4, r4, #(2 * \line_size) + blt 5f + cmp r4, r3 + sub ip, r3, r1 + /* + * "Catch-up" the early preloads (which have been performed up + * to aligned source address + line_size) to the preload offset + * used in the main loop. + */ + bge 3f +2: adds r4, r4, #\line_size /* Thumb16 */ + cmp r4, r3 + pld [r4, #(- \line_size)] + blt 2b +3: +.endif + + sub ip, ip, #\line_size +4: + /* + * Since the destination is 32-byte aligned, + * specify 256-bit alignment for the NEON stores. + */ +.if \line_size == 32 + vld1.32 {d0-d3}, [r1]! 
+ subs r2, r2, #32 +.if \prefetch_distance > 0 + pld [r1, ip] +.endif + vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]! +.else /* line_size == 64 */ + vld1.32 {d0-d3}, [r1]! + vld1.32 {d4-d7}, [r1]! +.if \prefetch_distance > 0 + pld [r1, ip] +.endif + vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]! + subs r2, r2, #64 + vst1.64 {d4-d7}, [r0 NEON_ALIGN(256)]! +.endif + bge 4b +.if \prefetch_distance > 0 +5: +.if \line_size == 32 + vld1.32 {d0-d3}, [r1]! + subs r2, r2, #32 + vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]! +.else /* line_size == 64 */ + vld1.32 {d0-d3}, [r1]! + vld1.32 {d4-d7}, [r1]! + vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]! + subs r2, r2, #64 + vst1.64 {d4-d7}, [r0 NEON_ALIGN(256)]! +.endif + cmn r2, #(\prefetch_distance * \line_size) + bge 5b +.endif + /* Correct the count. */ +23: adds r2, r2, #((\prefetch_distance + 1) * \line_size) +.if \prefetch_distance > 0 + pop {r5} +.endif + + /* + * Process the last 0-(line_size - 1) bytes, destination + * 32-byte aligned, source word aligned. + */ +6: +#ifdef CONFIG_THUMB + /* + * Use conditional NEON instructions when + * available (Thumb2 mode). + */ +.if \line_size == 64 + cmp r2, #32 + vld1ge.32 {d0-d3}, [r1]! + vst1ge.64 {d0-d3}, [r0 NEON_ALIGN(128)]! + tst r2, #16 + vld1ne.32 {d0, d1}, [r1]! + vst1ne.64 {d0, d1}, [r0 NEON_ALIGN(128)]! +.else + cmp r2, #16 + vld1ge.32 {d0, d1}, [r1]! + vst1ge.64 {d0, d1}, [r0 NEON_ALIGN(128)]! +.endif + tst r2, #8 + vld1ne.32 {d2}, [r1]! + vst1ne.64 {d2}, [r0 NEON_ALIGN(64)]! + tst r2, #4 + ldrne r3, [r1], #4 + strne r3, [r0 :32], #4 + + pop {r4} +#else + /* + * Just use the world-aligned tail code if we + * don't have Thumb2. + */ + b 17f +#endif + + /* + * Handle the last up to three bytes. Unaligned access + * may take place if source or destination is not + * half-word aligned. + */ +8: movs r2, r2, lsl #31 + ldrhcs r3, [r1], #2 + strhcs r3, [r0], #2 + ldrbne r3, [r1], #1 + strbne r3, [r0] +9: +.if \prefetch_distance > 0 || \early_prefetch == 1 + pop {r0} +.else + mov r0, ip +.endif + bx lr + +10: /* + * Unaligned case. Align the destination. + * Number of bytes is > 3. + * Note: This may use unaligned access. + * ip is the line_size aligned source address for preloads. + */ + cmp r2, #64 + push {r4} + /* For small sizes < 64 bytes just use the unaligned tail code. */ + blt 16f + ands r3, r0, #3 + beq 11f /* Destination is aligned but source is not. */ + /* Align the destination. */ + cmp r3, #2 + ldrbne r4, [r1], #1 + subne r2, r2, #1 + strbne r4, [r0], #1 + ldrhle r4, [r1], #2 + suble r2, r2, #2 + strhle r4, [r0], #2 + tst r1, #3 + beq 1b /* Destination and source are now aligned. */ + /* Destination is now aligned to a word boundary. */ +11: + cmp r2, #64 + /* + * Jump to non-aligned NEON tail code for <= 64 bytes. + */ + ble 16f + subs r2, r2, #\line_size + + /* Align destination to a 32-byte boundary. */ + ands r4, r0, #31 + rsb r4, r4, #32 + beq 20f + tst r4, #4 + sub r2, r2, r4 + ldrne r3, [r1 :8], #4 /* Unaligned access. */ + strne r3, [r0 :32], #4 + tst r4, #8 +#ifdef CONFIG_THUMB + /* + * Use conditional NEON instructions when + * available (Thumb2 mode) + */ + vld1ne.8 {d0}, [r1]! + vst1ne.64 {d0}, [r0 NEON_ALIGN(64)]! + cmp r4, #16 + vld1ge.8 {d2, d3}, [r1]! + vst1ge.64 {d2, d3}, [r0 NEON_ALIGN(128)]! +#else + beq 31f + vld1.8 {d0}, [r1]! + vst1.64 {d0}, [r0 NEON_ALIGN(64)]! +31: cmp r4, #16 + blt 32f + vld1.8 {d2, d3}, [r1]! + vst1.64 {d2, d3}, [r0 NEON_ALIGN(128)]! 
+32: +#endif + cmp r2, #0 + addlt r2, r2, #\line_size + blt 16f +20: + +.if \early_prefetch == 1 + pld [ip, #\line_size] +.endif +.if \prefetch_distance > 0 + /* + * Assume a preload at aligned base + line_size will + * be useful. + */ + push {r5} + mov r4, ip + add r5, r1, #(\prefetch_distance * \line_size) + subs r2, r2, #(\prefetch_distance * \line_size) + bic r3, r5, #(\line_size - 1) + add r4, r4, #(2 * \line_size) + blt 15f + cmp r4, r3 + sub ip, r3, r1 + /* + * "Catch-up" the early preloads (which have been performed up + * to aligned source address + line_size) to the preload offset + * used in the main loop. + */ + bge 13f +12: adds r4, r4, #\line_size /* Thumb16 */ + cmp r4, r3 + pld [r4, #(- \line_size)] + blt 12b +.endif + +13: + /* + * Process 64 unaligned bytes from source at a time and copy + * them to the 32-byte aligned destination. + */ +14: +.if \prefetch_distance > 0 + pld [r1, ip] +.endif +15: +.if \line_size == 32 + vld1.8 {d0-d3}, [r1]! + subs r2, r2, #32 + vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]! +.else /* line_size == 64 */ + vld1.8 {d0-d3}, [r1]! + vld1.8 {d4-d7}, [r1]! + vst1.64 {d0-d3}, [r0 NEON_ALIGN(256)]! + subs r2, r2, #64 + vst1.64 {d4-d7}, [r0 NEON_ALIGN(256)]! +.endif + bge 14b +.if \prefetch_distance > 0 + cmn r2, #(\prefetch_distance * \line_size) + bge 15b +.endif + /* Correct the count. */ + adds r2, r2, #((\prefetch_distance + 1) * \line_size) +.if \prefetch_distance > 0 + pop {r5} +.endif + + /* + * Handle last 0-(line_size - 1) bytes (destination 32-byte + * aligned source unaligned). + */ +#ifdef CONFIG_THUMB + /* + * Use conditional NEON instructions when + * available (Thumb2 mode) + */ +.if \line_size == 64 + cmp r2, #32 + vld1ge.8 {d0-d3}, [r1]! + vst1ge.64 {d0-d3}, [r0 NEON_ALIGN(128)]! + tst r2, #16 + vld1ne.8 {d0, d1}, [r1]! + vst1ne.64 {d0, d1}, [r0 NEON_ALIGN(128)]! +.else + cmp r2, #16 + vld1ge.8 {d0, d1}, [r1]! + vst1ge.64 {d0, d1}, [r0 NEON_ALIGN(128)]! +.endif + tst r2, #8 + vld1ne.8 {d2}, [r1]! + vst1ne.64 {d2}, [r0 NEON_ALIGN(64)]! + tst r2, #4 + ldrne r3, [r1], #4 + strne r3, [r0 :32], #4 + + pop {r4} + b 8b +#else + /* + * Fall through to the code below. It is not entirely + * optimal because it does not indicate the destination + * is word aligned. + */ +#endif + + /* Handle small size of 0-63 bytes, unaligned. */ +16: bic r3, r2, #7 + rsb r4, r3, #64 + tst r2, #7 + add pc, pc, r4 + nop + vld1.8 {d0}, [r1]! + vst1.8 {d0}, [r0]! + vld1.8 {d1}, [r1]! + vst1.8 {d1}, [r0]! + vld1.8 {d0}, [r1]! + vst1.8 {d0}, [r0]! + vld1.8 {d1}, [r1]! + vst1.8 {d1}, [r0]! + vld1.8 {d0}, [r1]! + vst1.8 {d0}, [r0]! + vld1.8 {d1}, [r1]! + vst1.8 {d1}, [r0]! + vld1.8 {d0}, [r1]! + vst1.8 {d0}, [r0]! + vld1.8 {d1}, [r1]! + vst1.8 {d1}, [r0]! + pop {r4} + beq 9b + tst r2, #4 + ldrne r3, [r1 :8], #4 /* Unaligned access. */ + strne r3, [r0], #4 + b 8b + + /* Handle small size of 0-63 bytes, word aligned. */ +17: +#ifdef CONFIG_THUMB + cmp r2, #32 + vld1ge.32 {d0-d3}, [r1]! + vst1ge.32 {d0-d3}, [r0]! + tst r2, #16 + vld1ne.32 {d0, d1}, [r1]! + vst1ne.32 {d0, d1}, [r0]! + tst r2, #8 + vld1ne.32 {d2}, [r1]! + vst1ne.32 {d2}, [r0]! + tst r2, #7 +#else + bic r3, r2, #7 + rsb r4, r3, #64 + tst r2, #7 + add pc, pc, r4 + nop + vld1.32 {d0}, [r1]! + vst1.32 {d0}, [r0]! + vld1.32 {d1}, [r1]! + vst1.32 {d1}, [r0]! + vld1.32 {d0}, [r1]! + vst1.32 {d0}, [r0]! + vld1.32 {d1}, [r1]! + vst1.32 {d1}, [r0]! + vld1.32 {d0}, [r1]! + vst1.32 {d0}, [r0]! + vld1.32 {d1}, [r1]! + vst1.32 {d1}, [r0]! + vld1.32 {d0}, [r1]! + vst1.32 {d0}, [r0]! + vld1.32 {d1}, [r1]! 
+ vst1.32 {d1}, [r0]! +#endif + pop {r4} + beq 9b + tst r2, #4 + ldrne r3, [r1], #4 + strne r3, [r0], #4 + b 8b + + /* + * Fast path for <= 256 bytes, word aligned. + * This is hardcoded for a preload offset of 128 bytes, + * which seems to work well in practice for small sizes. + */ +18: bics r3, r2, #31 +.if \early_prefetch == 1 + pld [ip, #32] + beq 21f + pld [ip, #64] + pld [ip, #96] +.endif + rsb r4, r3, #256 + ands r2, r2, #31 + /* + * Each code block handling 32 bytes is + * 12 bytes long. + */ + lsr r4, r4, #2 + add ip, ip, #128 + add r4, r4, r4, lsr #1 + sub ip, ip, r1 + add pc, pc, r4 + nop + pld [r1, ip] + vld1.32 {d0-d3}, [r1]! + vst1.32 {d0-d3}, [r0]! + pld [r1, ip] + vld1.32 {d4-d7}, [r1]! + vst1.32 {d4-d7}, [r0]! + pld [r1, ip] + vld1.32 {d0-d3}, [r1]! + vst1.32 {d0-d3}, [r0]! + pld [r1, ip] + vld1.32 {d4-d7}, [r1]! + vst1.32 {d4-d7}, [r0]! + pld [r1, ip] + vld1.32 {d0-d3}, [r1]! + vst1.32 {d0-d3}, [r0]! + W(nop) + vld1.32 {d4-d7}, [r1]! + vst1.32 {d4-d7}, [r0]! + W(nop) + vld1.32 {d0-d3}, [r1]! + vst1.32 {d0-d3}, [r0]! + W(nop) + vld1.32 {d4-d7}, [r1]! + vst1.32 {d4-d7}, [r0]! + beq 19f +21: +#ifdef CONFIG_THUMB + cmp r2, #16 + vld1ge.32 {d0-d1}, [r1]! + vst1ge.32 {d0-d1}, [r0]! + tst r2, #8 + vld1ne.32 {d0}, [r1]! + vst1ne.32 {d0}, [r0]! +#else + cmp r2, #16 + ldmiage r1!, {r3, r4} + stmiage r0!, {r3, r4} + ldmiage r1!, {r3, r4} + stmiage r0!, {r3, r4} + tst r2, #8 + ldmiane r1!, {r3, r4} + stmiane r0!, {r3, r4} +#endif + tst r2, #4 + pop {r4} + ldrne r3, [r1], #4 + strne r3, [r0 :32], #4 + and r2, r2, #3 + b 8b +19: + pop {r4} +.if \prefetch_distance > 0 || \early_prefetch == 1 + pop {r0} +.else + mov r0, ip +.endif + bx lr +.endm + + +#if defined(MEMCPY_REPLACEMENT_RPI) || defined(MEMCPY_REPLACEMENT_ARMV7_32) \ +|| defined(MEMCPY_REPLACEMENT_ARMV7_64) || defined(MEMCPY_REPLACEMENT_NEON_32) \ +|| defined(MEMCPY_REPLACEMENT_NEON_64) + +#ifdef MEMCPY_REPLACEMENT_RPI +asm_function memcpy + memcpy_variant 32, 3, 8, 0 +.endfunc +#endif + +#ifdef MEMCPY_REPLACEMENT_ARMV7_32 +asm_function memcpy + memcpy_variant 32, 6, 0, 0 +.endfunc +#endif + +#ifdef MEMCPY_REPLACEMENT_ARMV7_64 +asm_function memcpy + memcpy_variant 64, 3, 0, 0 +.endfunc +#endif + +#ifdef MEMCPY_REPLACEMENT_NEON_32 +asm_function memcpy + neon_memcpy_variant 32, 6, 1 +.endfunc +#endif + +#ifdef MEMCPY_REPLACEMENT_NEON_64 +asm_function memcpy + neon_memcpy_variant 64, 3, 1 +.endfunc +#endif + +#ifdef MEMCPY_REPLACEMENT_NEON_AUTO +asm_function memcpy + neon_memcpy_variant 32, 0, 1 +.endfunc +#endif + +#else + +asm_function memcpy_new_line_size_64_preload_192 + memcpy_variant 64, 3, 0, 0 +.endfunc + +asm_function memcpy_new_line_size_64_preload_192_align_32 + memcpy_variant 64, 3, 32, 0 +.endfunc + +asm_function memcpy_new_line_size_64_preload_192_aligned_access + memcpy_variant 64, 3, 0, 1 +.endfunc + +asm_function memcpy_new_line_size_32_preload_192 + memcpy_variant 32, 6, 0, 0 +.endfunc + +asm_function memcpy_new_line_size_32_preload_192_align_32 + memcpy_variant 32, 6, 32, 0 +.endfunc + +asm_function memcpy_new_line_size_32_preload_96 + memcpy_variant 32, 3, 8, 0 +.endfunc + +asm_function memcpy_new_line_size_32_preload_96_aligned_access + memcpy_variant 32, 3, 8, 1 +.endfunc + +asm_function memcpy_new_neon_line_size_64 + neon_memcpy_variant 64, 3, 1 +.endfunc + +asm_function memcpy_new_neon_line_size_32 + neon_memcpy_variant 32, 6, 1 +.endfunc + +asm_function memcpy_new_neon_line_size_32_auto + neon_memcpy_variant 32, 0, 1 +.endfunc + +#endif + +/* + * Macro for memset replacement. 
+/*
+ * Macro for memset replacement.
+ * write_align must be 0, 8, or 32.
+ * use_neon must be 0 or 1.
+ */
+
+.macro memset_variant write_align, use_neon
+.if \use_neon == 1
+	.fpu neon
+.endif
+	ands	r3, r0, #3
+	mov	ip, r0
+	bne	7f
+
+	/* Destination is word aligned. */
+1:	orr	r1, r1, r1, lsl #8
+.if \use_neon == 1
+	cmp	r2, #16
+.else
+	cmp	r2, #8
+.endif
+	orr	r1, r1, r1, lsl #16
+.if \use_neon == 1
+	blt	13f
+	vmov	d0, r1, r1
+	vmov	d1, r1, r1
+.else
+	blt	5f
+	mov	r3, r1
+.endif
+
+	cmp	r2, #64
+	push	{r4}
+.if \use_neon == 1
+	blt	10f
+.else
+	ble	10f
+.endif
+.if \write_align > 0
+	ands	r4, r0, #(\write_align - 1)
+.if \use_neon == 1
+#ifndef CONFIG_THUMB
+	add	r3, r4, #7
+#endif
+.endif
+	/* Let r4 be equal to the number of bytes to align. */
+	rsb	r4, r4, #\write_align
+	/*
+	 * At this point r4 contains the number of bytes to align
+	 * if eq is not set. The eq flag is set if there are no bytes
+	 * to align.
+	 */
+.if \write_align == 8
+	subne	r2, r2, r4
+	strne	r1, [r0], #4
+.elseif \write_align == 32
+	beq	2f
+	tst	r4, #4
+	sub	r2, r2, r4
+	strne	r1, [r0], #4
+.if \use_neon == 1
+#ifdef CONFIG_THUMB
+	tst	r4, #8
+	vst1ne.64	{d0}, [r0 NEON_ALIGN(64)]!
+	cmp	r4, #16
+	vst1ge.64	{d0, d1}, [r0 NEON_ALIGN(128)]!
+#else
+	bic	r4, r3, #7
+	lsr	r4, r4, #1
+	add	pc, pc, r4
+	nop
+	vst1.64	{d0}, [r0 NEON_ALIGN(64)]!
+	vst1.64	{d0}, [r0 NEON_ALIGN(64)]!
+	vst1.64	{d0}, [r0 NEON_ALIGN(64)]!
+	vst1.64	{d0}, [r0 NEON_ALIGN(64)]!
+#endif
+.else
+	tst	r4, #8
+	stmiane	r0!, {r1, r3}
+	cmp	r4, #16
+	stmiage	r0!, {r1, r3}
+	stmiage	r0!, {r1, r3}
+.endif
+.endif /* \write_align == 32 */
+	cmp	r2, #64
+	blt	4f
+.endif /* \write_align > 0 */
+
+2:
+.if \use_neon == 1
+	/*
+	 * When NEON is enabled, \write_align is
+	 * equal to 32 so specify 256-bit alignment in the
+	 * NEON store instructions.
+	 */
+	subs	r2, r2, #64
+	vmov	q1, q0
+3:	vst1.64	{d0-d3}, [r0 NEON_ALIGN(256)]!
+	subs	r2, r2, #64
+	vst1.64	{d0-d3}, [r0 NEON_ALIGN(256)]!
+	bge	3b
+	adds	r2, r2, #64
+.else
+	mov	r4, r1
+	subs	r2, r2, #64
+	push	{r5}
+	mov	r5, r1
+
+3:	stmia	r0!, {r1, r3, r4, r5}
+	subs	r2, r2, #64	/* Thumb16 */
+	stmia	r0!, {r1, r3, r4, r5}
+	stmia	r0!, {r1, r3, r4, r5}
+	stmia	r0!, {r1, r3, r4, r5}
+	bge	3b
+	adds	r2, r2, #64	/* Thumb16 */
+
+	pop	{r5}
+.endif
+	/* Early exit if there are 0 bytes left. */
+/* THUMB(	cbz	r2, 9f ) */
+THUMB(	cmp	r2, #0	)
+THUMB(	beq	9f	)
+ARM(	teq	r2, #0	)
+ARM(	beq	9f	)
+	/*
+	 * Handle 8-64 bytes (or 16-63 bytes in case of NEON).
+	 * In case of NEON, destination must be 8-byte aligned.
+	 */
+4:
+.if \use_neon == 1
+#ifdef CONFIG_THUMB
+	vmov	q1, q0
+	cmp	r2, #32
+	vst1ge.64	{d0-d3}, [r0 NEON_ALIGN(64)]!
+	tst	r2, #16
+	vst1ne.64	{d0, d1}, [r0 NEON_ALIGN(64)]!
+	tst	r2, #8
+	vst1ne.64	{d0}, [r0 NEON_ALIGN(64)]!
+	and	r2, r2, #7
+#else
+	bic	r4, r2, #15
+	subs	r2, r2, r4
+	rsb	r4, r4, #64
+	/*
+	 * When using NEON, the vst instruction
+	 * (storing 16 bytes) is always 32-bit.
+	 */
+	lsr	r4, r4, #2
+	add	pc, pc, r4
+	nop
+	vst1.64	{d0, d1}, [r0 NEON_ALIGN(64)]!
+	vst1.64	{d0, d1}, [r0 NEON_ALIGN(64)]!
+	vst1.64	{d0, d1}, [r0 NEON_ALIGN(64)]!
+	vst1.64	{d0, d1}, [r0 NEON_ALIGN(64)]!
+	cmp	r2, #8
+	strge	r1, [r0], #4
+	strge	r1, [r0], #4
+	subge	r2, r2, #8
+#endif
+.else /* use_neon == 0 */
+	bic	r4, r2, #7
+	subs	r2, r2, r4
+	rsb	r4, r4, #64
+	/*
+	 * The stmia instruction (storing 8 bytes) is 32-bit for ARM,
+	 * 16-bit for Thumb2.
+	 */
+THUMB(	lsrs	r4, r4, #2	)
+ARM(	lsr	r4, r4, #1	)
+	add	pc, pc, r4
+	nop
+	stmia	r0!, {r1, r3}
+	stmia	r0!, {r1, r3}
+	stmia	r0!, {r1, r3}
+	stmia	r0!, {r1, r3}
+	stmia	r0!, {r1, r3}
+	stmia	r0!, {r1, r3}
+	stmia	r0!, {r1, r3}
+	stmia	r0!, {r1, r3}
+.endif
+14:	pop	{r4}
+
+5:	cmp	r2, #4
+	strge	r1, [r0], #4
+	/* Early exit for multiple of 4 size. */
+	ands	r2, r2, #3
+	moveq	r0, ip
+	bxeq	lr
+
+	/*
+	 * At this point there are 1, 2 or 3 bytes,
+	 * and the destination is aligned.
+	 */
+6:	cmp	r2, #2
+	strhge	r1, [r0], #2
+	strbne	r1, [r0]
+	mov	r0, ip
+	bx	lr
+
+.if \use_neon == 1
+	/* 0-15 bytes left, word aligned. */
+13:	cmp	r2, #8
+	strge	r1, [r0]
+	strge	r1, [r0, #4]
+	addge	r0, r0, #8
+	subge	r2, r2, #8
+	b	5b
+.endif
+
+	/* Unaligned case. */
+7:	cmp	r2, #4
+	blt	8f
+#ifdef CONFIG_THUMB
+.if \use_neon == 1
+	/*
+	 * When Thumb2 is enabled with NEON, use the optimized
+	 * unaligned NEON code path for small sizes.
+	 */
+	cmp	r2, #64
+	blt	11f
+.endif
+#endif
+	/* Align the destination. */
+	cmp	r3, #2
+	sub	r2, r2, #4
+	strble	r1, [r0]
+	strble	r1, [r0, #1]
+	addle	r0, r0, #2
+	add	r2, r2, r3
+	strbne	r1, [r0], #1
+	b	1b
+
+	/* 0 to 3 bytes left. */
+8:	cmp	r2, #2
+	strbge	r1, [r0]
+	strbge	r1, [r0, #1]
+	addge	r0, r0, #2
+	tst	r2, #1
+	strbne	r1, [r0]
+	mov	r0, ip
+	bx	lr
+
+9:	pop	{r4}
+	mov	r0, ip
+	bx	lr
+
+	/*
+	 * Word aligned 8 <= size <= 64
+	 * (16 <= size <= 63 in case of NEON).
+	 */
+10:
+	/* Align the destination to an 8 byte boundary. */
+	tst	r0, #4
+	strne	r1, [r0], #4
+	subne	r2, r2, #4
+.if \use_neon == 1
+	cmp	r2, #16
+	poplt	{r4}
+	blt	13b
+.else
+	cmp	r2, #8
+	blt	14b
+.endif
+	b	4b
+
+#ifdef CONFIG_THUMB
+.if \use_neon == 1
+	/*
+	 * Handle 4 <= size <= 63 bytes, unaligned.
+	 * Use unaligned NEON instructions with Thumb2.
+	 */
+11:
+	orr	r1, r1, r1, lsl #8
+	tst	r2, #8
+	orr	r1, r1, r1, lsl #16
+	vmov	d0, r1, r1
+	vst1ne.8	{d0}, [r0]!
+	vmov	d1, r1, r1
+	tst	r2, #16
+	vst1ne.8	{d0, d1}, [r0]!
+	vmov	q1, q0
+	cmp	r2, #32
+	and	r2, r2, #7
+	vst1ge.8	{d0-d3}, [r0]!
+	cmp	r2, #4
+	/* The following store is unaligned. */
+	strge	r1, [r0], #4
+	subge	r2, r2, #4
+	b	8b
+.endif
+#endif
+.endm
+
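+/*
+ * The MEMSET_REPLACEMENT_* blocks below export a single variant under the
+ * name memset itself; when none of those macros is defined, the individually
+ * named variants (memset_new_align_0/8/32, memset_neon) are exported instead
+ * (see new_arm.h) so a caller can choose between them at run time.  For
+ * example, a NEON build could define MEMSET_REPLACEMENT_NEON_32 to get
+ * "memset_variant 32, 1" as its memset.
+ */
+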
+#if defined(MEMSET_REPLACEMENT_RPI) || defined(MEMSET_REPLACEMENT_ARMV7_32) \
+|| defined(MEMSET_REPLACEMENT_ARMV7_64) || defined(MEMSET_REPLACEMENT_NEON_32) \
+|| defined(MEMSET_REPLACEMENT_NEON_64)
+
+#ifdef MEMSET_REPLACEMENT_RPI
+asm_function memset
+	memset_variant 32, 0
+.endfunc
+#endif
+
+#if defined(MEMSET_REPLACEMENT_ARMV7_32) || defined(MEMSET_REPLACEMENT_ARMV7_64)
+asm_function memset
+	memset_variant 8, 0
+.endfunc
+#endif
+
+#if defined(MEMSET_REPLACEMENT_NEON_32) || defined(MEMSET_REPLACEMENT_NEON_64)
+asm_function memset
+	memset_variant 32, 1
+.endfunc
+#endif
+
+#else
+
+asm_function memset_new_align_0
+	memset_variant 0, 0
+.endfunc
+
+asm_function memset_new_align_8
+	memset_variant 8, 0
+.endfunc
+
+asm_function memset_new_align_32
+	memset_variant 32, 0
+.endfunc
+
+asm_function memset_neon
+	memset_variant 32, 1
+.endfunc
+
+#endif
diff --git a/veejay-current/veejay-server/thirdparty/fastarm/new_arm.h b/veejay-current/veejay-server/thirdparty/fastarm/new_arm.h
new file mode 100644
index 00000000..4d13699e
--- /dev/null
+++ b/veejay-current/veejay-server/thirdparty/fastarm/new_arm.h
@@ -0,0 +1,35 @@
+
+extern void *memcpy_new_line_size_64_preload_192(void *dest,
+	const void *src, size_t n);
+
+extern void *memcpy_new_line_size_64_preload_192_align_32(void *dest,
+	const void *src, size_t n);
+
+extern void *memcpy_new_line_size_64_preload_192_aligned_access(void *dest,
+	const void *src, size_t n);
+
+extern void *memcpy_new_line_size_32_preload_192(void *dest,
+	const void *src, size_t n);
+
+extern void *memcpy_new_line_size_32_preload_192_align_32(void *dest,
+	const void *src, size_t n);
+
+extern void *memcpy_new_line_size_32_preload_96(void *dest,
+	const void *src, size_t n);
+
+extern void *memcpy_new_line_size_32_preload_96_aligned_access(void *dest,
+	const void *src, size_t n);
+
+extern void *memcpy_new_neon_line_size_64(void *dest, const void *src, size_t n);
+
+extern void *memcpy_new_neon_line_size_32(void *dest, const void *src, size_t n);
+
+extern void *memcpy_new_neon_line_size_32_auto(void *dest, const void *src, size_t n);
+
+extern void *memset_new_align_0(void *dest, int c, size_t size);
+
+extern void *memset_new_align_8(void *dest, int c, size_t size);
+
+extern void *memset_new_align_32(void *dest, int c, size_t size);
+
+extern void *memset_neon(void *dest, int c, size_t size);
diff --git a/veejay-current/veejay-server/veejay/Makefile.am b/veejay-current/veejay-server/veejay/Makefile.am
index 7c4f621d..753aad89 100644
--- a/veejay-current/veejay-server/veejay/Makefile.am
+++ b/veejay-current/veejay-server/veejay/Makefile.am
@@ -66,6 +66,10 @@ if !HAVE_MJPEGTOOLS
 libveejay_la_LIBADD+=-L$(top_builddir)/thirdparty/mjpegtools -lmjpegutils
 endif
 
+if HAVE_ARM
+libveejay_la_LIBADD+=-L$(top_builddir)/thirdparty/fastarm -lfastarm
+endif
+
 libveejay_la_LDFLAGS += $(SDL_LIBS) $(SDL_TTF_LIBS) $(DIRECTFB_LIBS) $(X_LIBS) $(PTHREAD_LIBS) $(FT_LDFLAGS) $(FT_LIBS) \
 	$(XML2_LIBS) $(JPEG_LIBS) $(LIBLO_LIBS) $(LIBUNWIND_LIBS) $(GLIB_LIBS) \
 	$(FFMPEG_LIBS) $(XINERAMA_LIBS) $(MJPEGTOOLS_LIBS) $(LIBPNG_LIBS) \
diff --git a/veejay-current/veejay-server/veejay/veejay.c b/veejay-current/veejay-server/veejay/veejay.c
index 0b4a2cef..75b84cd0 100644
--- a/veejay-current/veejay-server/veejay/veejay.c
+++ b/veejay-current/veejay-server/veejay/veejay.c
@@ -788,7 +788,7 @@ int main(int argc, char **argv)
 {
 	veejay_free(info);
 	return 0;
-	}
+	}
 	print_license();
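
Note: the fastarm entry points above keep the standard memcpy()/memset() signatures, so the caller can time them against each other and against the C library before settling on one. A minimal sketch of such a selection loop follows; it only assumes the declarations from new_arm.h (on the include path) and an ARM/NEON build linked with -lfastarm. The helper names and the candidate list are illustrative, not part of this patch.

	#include <stdio.h>
	#include <string.h>
	#include <stdint.h>
	#include <time.h>
	#include "new_arm.h"

	typedef void *(*memcpy_fn)(void *, const void *, size_t);

	/* Candidate list: glibc memcpy plus two fastarm variants (illustrative). */
	static const struct { const char *name; memcpy_fn fn; } candidates[] = {
		{ "libc memcpy",                         memcpy },
		{ "memcpy_new_neon_line_size_32",        memcpy_new_neon_line_size_32 },
		{ "memcpy_new_line_size_32_preload_192", memcpy_new_line_size_32_preload_192 },
	};

	/* Time one candidate copying `size` bytes `iterations` times. */
	static double bench(memcpy_fn fn, uint8_t *dst, const uint8_t *src,
	                    size_t size, int iterations)
	{
		struct timespec t0, t1;
		clock_gettime(CLOCK_MONOTONIC, &t0);
		for (int i = 0; i < iterations; i++)
			fn(dst, src, size);
		clock_gettime(CLOCK_MONOTONIC, &t1);
		return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
	}

	int main(void)
	{
		enum { SIZE = 720 * 576, ITER = 2000 };	/* one PAL-sized plane */
		static uint8_t src[SIZE], dst[SIZE];
		memset(src, 0x80, SIZE);

		size_t best = 0;
		double best_t = 1e9;
		for (size_t i = 0; i < sizeof(candidates) / sizeof(candidates[0]); i++) {
			double t = bench(candidates[i].fn, dst, src, SIZE, ITER);
			printf("%-40s %.3f s\n", candidates[i].name, t);
			if (t < best_t) { best_t = t; best = i; }
		}
		printf("fastest: %s\n", candidates[best].name);
		return 0;
	}

The same pattern applies to the memset_new_align_* and memset_neon variants, which share the memset() signature.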