add fastarm memcpy

2025-12-14 11:50:02 +01:00 · 2016-02-13 18:53:36 +01:00
parent 61f7de936c
commit 1eb4ff18eb
10 changed files with 2172 additions and 160 deletions
--- a/veejay-current/veejay-server/configure.ac
+++ b/veejay-current/veejay-server/configure.ac
@@ -376,6 +376,27 @@ esac

 CFLAGS="$CFLAGS -D_FILE_OFFSET_BITS=64 -D_LARGE_FILES"

+if test x$host_alias != x; then
+	dnl Cross compiling
+	AC_MSG_CHECKING(sub-architecture settings)
+	if test x$have_x86cpu = xtrue; then
+		host_mod_cpu=`echo $host_cpu|tr _ -`
+		ARCHFLAGS="-march=$host_mod_cpu -mcpu=$host_mod_cpu"
+		AC_MSG_RESULT($ARCHFLAGS)
+	fi
+else
+	AC_MSG_CHECKING(sub-architecture settings)
+	
+	chmod +x $srcdir/cpuinfo.sh
+
+	if test "$arch_target" = "auto"; then
+		TMP=`$srcdir/cpuinfo.sh`
+		ARCHFLAGS=`cat veejay.arch`
+	else
+       		ARCHFLAGS="-mtune=generic"
+	fi
+		AC_MSG_RESULT($ARCHFLAGS)
+fi

 dnl ARM architecture detect NEON and set CFLAGS
 if test x$have_arm = xtrue
@@ -392,8 +413,11 @@ then
 	if test $ac_cv_flag_neon = yes ; then
 		AC_DEFINE(HAVE_ARM_NEON,1,[Compiling in NEON support])
 		USER_CFLAGS="-mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad $USER_CFLAGS"
+		FASTARM_CFLAGS="$ARCHFLAGS -Wa,-march=armv7-a -mthumb -Wa,-mthumb -Wa,-mimplicit-it=always -mthumb-interwork -DCONFIG_THUMB"
+		# Without NEON no assembler -march override is passed for fastarm.
 	else
 		USER_CFLAGS="-march=native -ftree-vectorize $USER_CFLAGS"
+		FASTARM_CFLAGS="$ARCHFLAGS -mthumb -Wa,-mthumb -Wa,-mimplicit-it=always -mthumb-interwork -DCONFIG_THUMB"
 	fi
 	
 	if test "x$enable_debug" != "xyes" ; then
@@ -406,6 +430,8 @@ then
 	SUBSAMPLE_CFLAGS="$USER_CFLAGS"
 	VJE_CFLAGS="$USER_CFLAGS"
 	CFLAGS="$USER_CFLAGS -D_FILE_OFFSET_BITS=64 -D_LARGE_FILES"
+
+	AC_SUBST(FASTARM_CFLAGS)
 fi

 dnl This flag is used for PROGRAMS not SHARED LIBRARIES.  PIC code is required
@@ -584,28 +610,6 @@ EOF
    fi
 fi

-if test x$host_alias != x; then
-	dnl Cross compiling
-	AC_MSG_CHECKING(sub-architecture settings)
-	if test x$have_x86cpu = xtrue; then
-		host_mod_cpu=`echo $host_cpu|tr _ -`
-		ARCHFLAGS="-march=$host_mod_cpu -mcpu=$host_mod_cpu"
-		AC_MSG_RESULT($ARCHFLAGS)
-	fi
-else
-	AC_MSG_CHECKING(sub-architecture settings)
-	
-	chmod +x $srcdir/cpuinfo.sh
-
-	if test "$arch_target" = "auto"; then
-		TMP=`$srcdir/cpuinfo.sh`
-		ARCHFLAGS=`cat veejay.arch`
-	else
-       		ARCHFLAGS="-mtune=generic"
-	fi
-		AC_MSG_RESULT($ARCHFLAGS)
-fi
-
 have_mjpegtools=false
 AC_SUBST(MJPEGTOOLS_CFLAGS)
 AC_SUBST(MJPGETOOLS_LIBS)
@@ -1074,6 +1078,7 @@ AM_CONDITIONAL(HAVE_JPEG,test x$have_jpeg = xtrue)
 AM_CONDITIONAL(HAVE_LIBLO,test x$have_liblo = xtrue)
 AM_CONDITIONAL(HAVE_FREETYPE2, test x$have_freetype2 = xtrue)
 AM_CONDITIONAL(HAVE_MJPEGTOOLS, test x$have_mjpegtools = xtrue )
+AM_CONDITIONAL(HAVE_ARM, test x$have_arm = xtrue )
 dnl *********************************************************************
 dnl Check for what warnings we want gcc to use and adjust the CFLAGS
 dnl as needed. This only works for GCC.
@@ -1161,6 +1166,7 @@ fi

 AC_CONFIG_FILES([
 thirdparty/Makefile
+thirdparty/fastarm/Makefile
 thirdparty/aclib/Makefile
 thirdparty/bio2jack/Makefile
 thirdparty/libhash/Makefile
--- a/veejay-current/veejay-server/libvjmem/memcpy.c
+++ b/veejay-current/veejay-server/libvjmem/memcpy.c
@@ -141,6 +141,10 @@
 #include <libvje/vje.h>
 #include <veejay/vj-task.h>
 #include <libavutil/cpu.h>
+#ifdef HAVE_ARM
+#include <fastarm/new_arm.h>
+#endif
+
 #define BUFSIZE 1024


@@ -157,37 +161,12 @@
 static int selected_best_memcpy = 1;
 static int selected_best_memset = 1;

-#ifdef HAVE_POSIX_TIMERS
-static int64_t _x_gettime(void)
+static double get_time(void) /* seconds as a double; only the difference between two calls is meaningful */
 {
-	struct timespec tm;
-	return (clock_gettime(CLOCK_THREAD_CPUTIME_ID,&tm) == -1 )
-			? times(NULL)
-			: (int64_t) tm.tv_sec * 1e9 + tm.tv_nsec;
+	struct timespec ts = { 0, 0 }; /* defined value even if clock_gettime() fails */
+	clock_gettime( CLOCK_MONOTONIC, &ts ); /* monotonic clock: intervals are not disturbed by wall-clock adjustments */
+	return (double) ts.tv_sec + (double) ts.tv_nsec / 1000000000.0;
 }
-#define rdtsc(x) _x_gettime()
-#elif (defined(ARCH_X86) || defined(ARCH_X86_64)) && defined(HAVE_SYS_TIMES_H)
-static int64_t rdtsc(int cpu_flags)
-{
-	int64_t x;
-	if( cpu_flags & AV_CPU_FLAGS_MMX ) {
-			__asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
-			return x;
-	} else {
-			return times(NULL);
-	}
-}
-#else
-static uint64_t rdtsc(int cpu_flags)
-{
-#ifdef HAVE_SYS_TIMES_H
-		struct tms tp;
-		return times(&tp);
-#else
-		return clock();
-#endif
-}
-#endif /* HAVE_SYS_TIMES_H */

 #if defined(ARCH_X86) || defined (ARCH_X86_64)
 /* for small memory blocks (<256 bytes) this version is faster */
@@ -251,7 +230,6 @@ void	yuyv_plane_clear( size_t len, void *to )
 	if( vj_task_available() ) {
 		uint8_t * t    = (uint8_t*) to;
 		uint8_t *in[4] = { t, NULL,NULL,NULL };
-		int 	strides[4] = { len, 0,0,0 };
 		vj_task_run( in, in, NULL, NULL, 1, (performer_job_routine) &yuyv_plane_clear_job );
 	}
 	else {
@@ -1349,10 +1327,10 @@ static void *memcpy_neon( void *to, const void *from, size_t n )


 static struct {
-     char                 *name;
-     void               *(*function)(void *to, const void *from, size_t len);
-     uint64_t	   time;
-     uint32_t		cpu_require;
+     char	*name;
+     void	*(*function)(void *to, const void *from, size_t len);
+     double	t;
+     uint32_t cpu_require;
 } memcpy_method[] =
 {
     { NULL, NULL, 0},
@@ -1382,23 +1360,44 @@ static struct {
 #endif  
 #ifdef HAVE_ARM_NEON
 	{ "NEON optimized memcpy()", (void*) memcpy_neon, 0, AV_CPU_FLAG_NEON },
+#endif
+#ifdef HAVE_ARM
+	{ "new mempcy for cortex with line size of 32, preload offset of 192 (C) Harm Hanemaaijer <fgenfb@yahoo.com>", (void*) memcpy_new_line_size_32_preload_192,0,0 },
+	{ "new memcpy for cortex with line size of 64, preload offset of 192 (C) Harm Hanemaaijer <fgenfb@yahoo.com>" ,(void*) memcpy_new_line_size_64_preload_192, 0, 0 },
+    { "new memcpy for cortex with line size of 64, preload offset of 192, aligned access (C) Harm Hanemaaijer <fgenfb@yahoo.com>", (void*) memcpy_new_line_size_64_preload_192_aligned_access, 0, 0 },
+	{ "new memcpy for cortex with line size of 32, preload offset of 192, align 32", (void*) memcpy_new_line_size_32_preload_192_align_32,0,0},
+	{ "new memcpy for cortex with line size of 32, preload offset of 96", (void*) memcpy_new_line_size_32_preload_96,0,0},
+	{ "new memcpy for cortex with line size of 32, preload offset of 96, aligned access", (void*) memcpy_new_line_size_32_preload_96_aligned_access,0,0},
+#endif
+#ifdef HAVE_ARM_NEON
+	{ "new memcpy for cortex using NEON with line size of 32, preload offset of 192 (C) Harm Hanemaaijer <fgenfb@yahoo.com>", (void*) memcpy_new_neon_line_size_32,0,AV_CPU_FLAG_NEON},
+	{ "new memcpy for cortex using NEON with line size of 64, preload offset of 192 (C) Harm Hanemaaijer <fgenfb@yahoo.com>", (void*) memcpy_new_neon_line_size_64,0,AV_CPU_FLAG_NEON},
+	{ "new mempcy for cortex using NEON with line size of 32, automatic prefetcher (C) Harm Hanemaaijer <fgenfb@yayhoo.com>", (void*) memcpy_new_neon_line_size_32_auto,0,AV_CPU_FLAG_NEON},
 #endif
     { NULL, NULL, 0},
 };

 static struct {
-     char                 *name;
-     void                *(*function)(void *to, uint8_t c, size_t len);
-	 uint64_t	    time;
-     uint32_t		cpu_require;
+	char            *name;
+	void            *(*function)(void *to, uint8_t c, size_t len);
+	double			t;		/* benchmark time in seconds, written by find_best_memset(); member order must match the positional initializers below */
+	uint32_t		cpu_require;	/* AV_CPU_FLAG_* mask needed to run this entry, 0 = no requirement */
 } memset_method[] =
 {
-     { NULL, NULL, 0,0},
-     { "glibc memset()",            (void*)memset, 0,0},
+	{ NULL, NULL, 0,0},
+	{ "glibc memset()",(void*)memset,0,0},
 #if defined(HAVE_ASM_MMX) || defined(HAVE_ASM_MMX2) || defined(HAVE_ASM_SSE)
-     { "MMX/MMX2/SSE optimized memset()", (void*)   fast_memset, 0, AV_CPU_FLAG_MMX|AV_CPU_FLAG_SSE|AV_CPU_FLAG_MMX2},
+	{ "MMX/MMX2/SSE optimized memset()", (void*) fast_memset,0,AV_CPU_FLAG_MMX|AV_CPU_FLAG_SSE|AV_CPU_FLAG_MMX2 },
 #endif 
-       { NULL, NULL, 0,0},
+#ifdef HAVE_ARM_NEON
+	{ "memset_neon (C) Harm Hanemaaijer <fgenfb@yahoo.com>", (void*) memset_neon,0, AV_CPU_FLAG_NEON },
+#endif
+#ifdef HAVE_ARM
+	{ "memset align 0 (C) Harm Hanemaaijer <fgenfb@yahoo.com>", (void*) memset_new_align_0,0,0 },
+	{ "memset align 8 (C) Harm Hanemaaijer <fgenfb@yahoo.com>", (void*) memset_new_align_8,0,0 },
+	{ "memset align 32 (C) Harm Hanemaaijer <fgenfb@yahoo.com>", (void*) memset_new_align_32,0,0 },
+#endif
+	{ NULL, NULL, 0, 0},
 };


@@ -1407,10 +1406,10 @@ void	memcpy_report()
 	int i;
 	fprintf(stdout,"SIMD benchmark results:\n");
 	for( i = 1; memset_method[i].name; i ++ ) {
-		fprintf(stdout,"\t%8ld : %s\n",(long) memset_method[i].time,  memset_method[i].name );
+		fprintf(stdout,"\t%g : %s\n",memset_method[i].t,  memset_method[i].name );
 	}
 	for( i = 1; memcpy_method[i].name; i ++ ) {
-		fprintf(stdout,"\t%8ld : %s\n",(long) memcpy_method[i].time,  memcpy_method[i].name );
+		fprintf(stdout,"\t%g : %s\n",memcpy_method[i].t,  memcpy_method[i].name );
 	}
 }

@@ -1430,7 +1429,7 @@ char *get_memset_descr()

 void find_best_memcpy()
 {
-     uint64_t t;
+     double t;
     char *buf1, *buf2;
     int i, best = 0,k;
     int bufsize = 720 * 576 * 3;
@@ -1445,6 +1444,8 @@ void find_best_memcpy()

     int cpu_flags = av_get_cpu_flags();
 	
+	veejay_msg(VEEJAY_MSG_DEBUG, "Finding best memcpy ..." );	
+
     memset(buf1,0, bufsize);
     memset(buf2,0, bufsize);

@@ -1454,28 +1455,28 @@ void find_best_memcpy()

 	for( i = 1; memcpy_method[i].name; i ++ ) {
 		
-		t = rdtsc(cpu_flags);
+		t = get_time();
 		if( memcpy_method[i].cpu_require && !(cpu_flags & memcpy_method[i].cpu_require ) ) {
-			memcpy_method[i].time = 0;
+			memcpy_method[i].t = 0.0;
 			continue;
 		}

 		for( k = 0; k < 128; k ++ ) {
 			memcpy_method[i].function( buf1,buf2, bufsize );
 		}
-		t = rdtsc(cpu_flags) - t;
-		memcpy_method[i].time = t;
+		t = get_time() - t;
+		memcpy_method[i].t = t;
 	}

 	for( i = 1; memcpy_method[i].name; i ++ ) {
 		if(best == 0 ) { 
 			best = i;
-		    t = memcpy_method[i].time;	
+		    t = memcpy_method[i].t;	
 			continue;
 		}

-		if( memcpy_method[i].time < t && memcpy_method[i].time > 0 ) {
-			t = memcpy_method[i].time;
+		if( memcpy_method[i].t < t && memcpy_method[i].t > 0 ) {
+			t = memcpy_method[i].t;
 			best = i;
 		}
 	}
@@ -1494,53 +1495,55 @@ void find_best_memcpy()

 void find_best_memset()
 {
-    uint64_t t;
-    char *buf1, *buf2;
-    int i, best = 0,k;
+	double t; /* elapsed seconds for the memset_method entry being timed */
+	char *buf1, *buf2;
+	int i, best = 0,k;
 	int bufsize = 720 * 576 * 3;
 	int cpu_flags = av_get_cpu_flags();
 	
-     if (!(buf1 = (char*) malloc( bufsize * sizeof(char) )))
-          return;
+	if (!(buf1 = (char*) malloc( bufsize * sizeof(char) )))
+		return;
+
+	if (!(buf2 = (char*) malloc( bufsize * sizeof(char) ))) { /* NOTE(review): buf2 is cleared below but never passed to a candidate */
+		free( buf1 );
+		return;
+	}
+
+	veejay_msg(VEEJAY_MSG_DEBUG, "Finding best memset..." );

-     if (!(buf2 = (char*) malloc( bufsize * sizeof(char) ))) {
-          free( buf1 );
-          return;
-     }
-	
 	memset( buf1, 0, bufsize * sizeof(char));
 	memset( buf2, 0, bufsize * sizeof(char));

-     for (i=1; memset_method[i].name; i++)
-     {
+	for (i=1; memset_method[i].name; i++)
+	{
 		if( memset_method[i].cpu_require && !(cpu_flags & memset_method[i].cpu_require ) ) {
-			memset_method[i].time= 0;
+			memset_method[i].t= 0;
 			continue;
-		}
+		}

-        t = rdtsc(cpu_flags);
-		for( k = 0; k < 128; k ++ ) {
-			memset_method[i].function( buf1 , 0 , bufsize );
-		}
-        t = rdtsc(cpu_flags) - t;
-	  
-        memset_method[i].time = t;
+		t = get_time();
+		for( k = 0; k < 128; k ++ ) {
+			memset_method[i].function( buf1 , 0 , bufsize );
+		}
+		t = get_time() - t;
+
+		memset_method[i].t = t;

-        if (best == 0 || t < memset_method[best].time)
-        	best = i;
-     }
+		if (best == 0 || t < memset_method[best].t) /* keep the entry with the smallest elapsed time */
+			best = i;
+	}

-     if (best) {
-          veejay_memset = memset_method[best].function;
-     } else {
-		  veejay_memset = memset_method[1].function;
-	 }
+	if (best) {
+		veejay_memset = memset_method[best].function;
+	}
+	else {
+		veejay_memset = memset_method[1].function;
+	}

 	selected_best_memset = best;

-	 free( buf1 );
-     free( buf2 );
-
+	free( buf1 );
+	free( buf2 );
 }

 static	void	vj_frame_copy_job( void *arg ) {
@@ -1721,122 +1724,118 @@ void	vj_frame_clear1( uint8_t *input, unsigned int val, int size )
 	vj_frame_clear( in, strides, val );
 }

-static unsigned long benchmark_single_slow(long c, int n_tasks, uint8_t **source, uint8_t **dest, int *planes)
+static double benchmark_single_slow(long c, int n_tasks, uint8_t **source, uint8_t **dest, int *planes)
 {
-	uint64_t k;
-	uint64_t stats[c];
+	int k;
+	double stats[c];
 	uint64_t bytes = ( planes[0] + planes[1] + planes[2] + planes[3] );

 	for( k = 0; k < c; k ++ )	
 	{
-		uint64_t t = rdtsc(0);
+		double t = get_time();
 		vj_frame_slow_single( source, source, dest, planes[0], planes[1]/2, 0.67f );
-		t = rdtsc(0) - t;
+		t = get_time() - t;
 		stats[k] = t;
 	}

-	uint64_t sum = 0;
+	double sum = 0.0;
 	for( k = 0; k < c ;k ++ )
 		sum += stats[k];

-	uint64_t best_time = (sum / c );
+	double best_time = (sum / c );

-	veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %2.2f ms",
-			(float)(bytes /1048576.0f), (best_time/1000.0f));
+	veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %g",(float)((bytes*c) /1048576.0f), best_time);

 	return best_time;
 }


-static unsigned long benchmark_threaded_slow(long c, int n_tasks, uint8_t **source, uint8_t **dest, int *planes)
+static double benchmark_threaded_slow(long c, int n_tasks, uint8_t **source, uint8_t **dest, int *planes) /* returns mean seconds per vj_frame_slow_threaded() call over c runs */
 {
-	uint64_t k;
-	uint64_t stats[c];
+	int k;
+	double stats[c];
 	uint64_t bytes = ( planes[0] + planes[1] + planes[2] + planes[3] );

 	for( k = 0; k < c; k ++ )	
 	{
-		uint64_t t = rdtsc(0);
+		double t = get_time(); /* must be double: get_time() returns fractional seconds */
 		vj_frame_slow_threaded( source, source, dest, planes[0], planes[1]/2, 0.67f );
-		t = rdtsc(0) - t;
+		t = get_time() - t;
 		stats[k] = t;
 	}

-	uint64_t sum = 0;
+	double sum = 0.0;
 	for( k = 0; k < c ;k ++ )
 		sum += stats[k];

-	uint64_t best_time = (sum / c );
+	double best_time = (sum / c );

-	veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %2.2f ms",
-			(float)(bytes /1048576.0f), (best_time/1000.0f));
+	veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %g",(float)((bytes*c) /1048576.0f), best_time);

 	return best_time;
 }


-static unsigned long benchmark_threaded_copy(long c, int n_tasks, uint8_t **dest, uint8_t **source, int *planes)
+static double benchmark_threaded_copy(long c, int n_tasks, uint8_t **dest, uint8_t **source, int *planes)
 {
-	uint64_t k;
-	uint64_t stats[c];
+	int k;
+	double stats[c];
 	uint64_t bytes = ( planes[0] + planes[1] + planes[2] + planes[3] );

 	for( k = 0; k < c; k ++ )	
 	{
-		uint64_t t = rdtsc(0);
+		double t = get_time();
 		vj_frame_copyN( source,dest,planes );
-		t = rdtsc(0) - t;
+		t = get_time() - t;
 		stats[k] = t;
 	}

-	uint64_t sum = 0;
+	double sum = 0.0;
 	for( k = 0; k < c ;k ++ )
 		sum += stats[k];

-	uint64_t best_time = (sum / c );
+	double best_time = (sum / c );

-	veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %2.2f ms",
-			(float)(bytes /1048576.0f), (best_time/1000.0f));
+	veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %g",(float)((bytes*c) /1048576.0f), best_time);

 	return best_time;
 }

-static unsigned long benchmark_single_copy(long c,int dummy, uint8_t **dest, uint8_t **source, int *planes)
+static double benchmark_single_copy(long c,int dummy, uint8_t **dest, uint8_t **source, int *planes)
 {
-	uint64_t k; int j;
-	uint64_t stats[c];
+	int k; int j;
+	double stats[c];
 	uint64_t bytes = ( planes[0] + planes[1] + planes[2] + planes[3] );

 	for( k = 0; k < c; k ++ ) {
-		uint64_t t = rdtsc(0);
+		double t = get_time();
 		for( j = 0; j < 4; j ++ ) {
 			veejay_memcpy( dest[j], source[j], planes[j] );
 		}
-		t = rdtsc(0) - t;
+		t = get_time() - t;
 		stats[k] = t;
 	}

-	uint64_t sum = 0;
+	double sum = 0.0;
 	for( k = 0; k < c; k ++ ) 
 		sum += stats[k];

-	uint64_t best_time = (sum/c);
+	double best_time = (sum/c);
 	
-	veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %2.2f ms",
-			(float)(bytes /1048576.0f), (best_time/1000.0f));
+	veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %g",(float)((bytes*c) /1048576.0f), best_time);

 	return best_time;
 }

-typedef unsigned long (*benchmark_func)(long c, int dummy, uint8_t **dest, uint8_t **source, int *planes);
+typedef double (*benchmark_func)(long c, int dummy, uint8_t **dest, uint8_t **source, int *planes);


 void run_benchmark_test(int n_tasks, benchmark_func f, char *str, int n_frames, uint8_t **dest, uint8_t **source, int *planes )
 {
-	uint32_t N = 8;
-	uint64_t stats[N];	
+	int N = 8;
+	double stats[N];	
 	uint32_t i;
-	uint64_t fastest = 0;
+	double fastest = 0.0;
 	float work_size = (planes[0] + planes[1] + planes[2] + planes[3]) / 1048576.0f;

 	veejay_msg(VEEJAY_MSG_INFO, "run test '%s' (%dx) on chunks of %2.2f MB:", str, N, work_size );
@@ -1848,8 +1847,8 @@ void run_benchmark_test(int n_tasks, benchmark_func f, char *str, int n_frames,
 			fastest = stats[i];
 	}
 	
-	uint64_t sum = 0;
-	uint64_t slowest=fastest;
+	double sum = 0.0;
+	double slowest=fastest;
 	for( i = 0; i < N; i ++ )
 	{
 		if( stats[i] < fastest ) {
@@ -1860,8 +1859,7 @@ void run_benchmark_test(int n_tasks, benchmark_func f, char *str, int n_frames,

 	float average = (sum / N);

-	veejay_msg(VEEJAY_MSG_INFO, "run done: best score for %s is %2.4f ms, worst is %2.4f ms, average is %2.4f ms", 
-		str, fastest/1000.0f, slowest/1000.0f, average/1000.0f );
+	veejay_msg(VEEJAY_MSG_INFO, "run done: best score for %s is %g, worst is %g, average is %g",str, fastest, slowest, average );
 }

 void benchmark_tasks(int n_tasks, long n_frames, int w, int h)
@@ -1914,16 +1912,19 @@ void	benchmark_veejay(int w, int h)
 	if( h < 64)
 		h = 64;

+	veejay_msg(VEEJAY_MSG_INFO, "Starting benchmark %dx%d YUVP 4:2:2 (100 frames)", w,h);
+
 	int n_tasks = task_num_cpus();
-	init_parallel_tasks( n_tasks );
 	char *str2 = getenv( "VEEJAY_MULTITHREAD_TASKS" );
 	if( str2 ) {
 		n_tasks = atoi(str2);
 	}
-	
-	int n_frames = 100;
-	veejay_msg(VEEJAY_MSG_INFO, "Benchmark %dx%d YUVP 4:2:2 (%d frames)", w,h,n_frames);
-	benchmark_tasks( n_tasks, n_frames,w,h );
+
+	veejay_msg(VEEJAY_MSG_INFO, "VEEJAY_MULTITHREAD_TASKS=%d", n_tasks );
+
+	init_parallel_tasks( n_tasks );
+
+	benchmark_tasks( n_tasks,100,w,h );
 }

 void	*vj_hmalloc(size_t sze, const char *name)
--- a/veejay-current/veejay-server/libvjmem/vj-x86.c
+++ b/veejay-current/veejay-server/libvjmem/vj-x86.c
@@ -183,10 +183,6 @@ int	vj_mem_threaded_init(int w, int h)
 			num_tasks = n_cpus;
 			if( num_tasks < 1 )
 				num_tasks = 1;
-
-			if( num_tasks > 1 )
-				veejay_msg( VEEJAY_MSG_INFO, "Using %d threads scheduled over %d cpus in performer.", num_tasks, n_cpus-1 );
-
 		}
 		
 	}
--- a/veejay-current/veejay-server/thirdparty/Makefile.am
+++ b/veejay-current/veejay-server/thirdparty/Makefile.am
@@ -3,3 +3,8 @@ SUBDIRS = aclib bio2jack libhash liblzo libOSC libresample
 if !HAVE_MJPEGTOOLS
 SUBDIRS += mjpegtools 
 endif
+
+if HAVE_ARM
+SUBDIRS += fastarm
+endif
+
--- a/veejay-current/veejay-server/thirdparty/fastarm/Makefile.am
+++ b/veejay-current/veejay-server/thirdparty/fastarm/Makefile.am
@@ -0,0 +1,10 @@
+# Makefile for veejay
+MAINTAINERCLEANFILES = Makefile.in
+AM_CFLAGS = $(FASTARM_CFLAGS)
+AM_CPPFLAGS = -I$(top_srcdir) -I$(includedir) \
+              -I$(top_srcdir)/thirdparty $(FASTARM_CFLAGS) 
+
+FASTARM_LIB_FILE = libfastarm.la
+noinst_LTLIBRARIES = $(FASTARM_LIB_FILE)
+libfastarm_la_SOURCES = new_arm.S
+EXTRA_DIST= 
--- a/veejay-current/veejay-server/thirdparty/fastarm/README
+++ b/veejay-current/veejay-server/thirdparty/fastarm/README
@@ -0,0 +1,97 @@
+fastarm
+
+This toolkit contains a set of fast memcpy/memset variants for ARM
+platforms. They use either the standard register file or, optionally,
+NEON instructions.
+
+Several basic families of variants are provided; the current ones are
+the "new memcpy" variants which are the default for memcpy replacement,
+which generally do not overfetch beyond the source region and can be
+configured to use unaligned memory access for small sizes, or to use
+strictly aligned memory access. This family can also be configured to
+include a fast path for smaller sizes (this is the default), disabling
+this results in smaller code size at the expense of worse performance
+for small sizes. NEON optimized versions, which are generally faster
+with reduced code size, are also provided.
+
+To compile the benchmark program, run 'make'. This will compile in a
+plethora of variants with different preload strategies, block sizes,
+alignment etc.
+
+A benchmark program to compare various memcpy variants is provided. Try
+something like "./benchmark --memcpy ad --all". (Use --memcpy al on the
+Raspberry Pi platform).
+
+To compile a memcpy replacement library, set PLATFORM to one of the
+values described at the beginning of the Makefile. This selects the
+cache line size to use and whether to use NEON versions.
+
+Optionally disable Thumb2 mode compilation by commenting out the THUMBFLAGS
+definition. It must be disabled on the Raspberry Pi.
+
+Then run:
+
+    sudo make install_memcpy_replacement
+
+The replacement memcpy/memset shared library will be installed into
+/usr/lib/arm-linux-gnueabihf/ as libfastarm.so.
+
+To enable the use of the replacement memcpy in applications, create or edit
+the file /etc/ld.so.preload so that it contains the line:
+
+    /usr/lib/arm-linux-gnueabihf/libfastarm.so
+
+On the RPi platform, references to libcofi_rpi.so should be commented out
+or deleted. The new memcpy should now be activated for newly launched
+programs. To be sure, reboot or run:
+
+    sudo ldconfig
+
+To revert to the default optimized memcpy on the RPi platform,
+edit /etc/ld.so.preload so that it contains the line:
+
+    /usr/lib/arm-linux-gnueabihf/libcofi_rpi.so
+
+instead of the one using libfastarm.so.
+
+Note on cache line size:
+
+Although assuming a preload line size of 64 bytes is a little faster on several
+Cortex platforms for small to moderate sizes, assuming 32-byte preloads seems
+to be faster when accessing DRAM with larger sizes. On earlier
+Cortex A9 models, 32-byte preloads are required for good performance in all
+cases.
+
+Notes on performance with and without NEON:
+
+For NEON-based memcpy, a significant benefit is seen on the tested Cortex A8
+platform for unaligned copies in cache memory and for aligned and unaligned
+copies in DRAM. Performance for aligned copies in cache memory is relatively
+similar to the optimized non-NEON function.
+
+Results in MB/s on a Cortex A8, with Thumb2 mode enabled, of
+standard libc (Debian unstable), armv7 and NEON optimized memcpy
+variants with line size of 32 bytes:
+
+		libc	armv7	NEON
+test 0		522	549	567
+test 1		329	377	378
+test 2		434	430	513
+test 28		351	361	458
+test 29		246	248	358
+test 43		467	512	581
+
+Test 0 in the benchmark program tests word-aligned requests with
+sizes that are a power of 2 up to 4096 bytes distributed according
+to a power law.
+Test 1 in the benchmark program tests word-aligned requests with
+sizes up to 1024 that are a multiple of 4, distributed according
+to a power law.
+Test 2 in the benchmark program tests unaligned requests with sizes
+up to 1023 bytes.
+Test 28 in the benchmark program tests word aligned requests in DRAM
+with sizes up to 1024 bytes.
+Test 29 in the benchmark program tests word aligned requests in DRAM
+with sizes up to 256 bytes.
+Test 43 in the benchmark program tests page aligned requests in DRAM
+of size 4096 (copying a memory page).
--- a/veejay-current/veejay-server/thirdparty/fastarm/new_arm.S
+++ b/veejay-current/veejay-server/thirdparty/fastarm/new_arm.S
--- a/veejay-current/veejay-server/thirdparty/fastarm/new_arm.h
+++ b/veejay-current/veejay-server/thirdparty/fastarm/new_arm.h
@@ -0,0 +1,49 @@
+/*
+ * Prototypes for the fastarm memcpy/memset variants (expected to be
+ * implemented in new_arm.S, the only source of libfastarm).
+ *
+ * The memcpy variants take memcpy()-style arguments and the memset variants
+ * memset()-style arguments. NOTE(review): the name suffixes (line_size_N,
+ * preload_N, align_N, aligned_access, neon) presumably name the tuning each
+ * variant was assembled with -- confirm against new_arm.S.
+ */
+#ifndef FASTARM_NEW_ARM_H
+#define FASTARM_NEW_ARM_H
+
+#include <stddef.h> /* size_t */
+
+extern void *memcpy_new_line_size_64_preload_192(void *dest,
+    const void *src, size_t n);
+
+extern void *memcpy_new_line_size_64_preload_192_align_32(void *dest,
+    const void *src, size_t n);
+
+extern void *memcpy_new_line_size_64_preload_192_aligned_access(void *dest,
+    const void *src, size_t n);
+
+extern void *memcpy_new_line_size_32_preload_192(void *dest,
+    const void *src, size_t n);
+
+extern void *memcpy_new_line_size_32_preload_192_align_32(void *dest,
+    const void *src, size_t n);
+
+extern void *memcpy_new_line_size_32_preload_96(void *dest,
+    const void *src, size_t n);
+
+extern void *memcpy_new_line_size_32_preload_96_aligned_access(void *dest,
+    const void *src, size_t n);
+
+extern void *memcpy_new_neon_line_size_64(void *dest, const void *src, size_t n);
+
+extern void *memcpy_new_neon_line_size_32(void *dest, const void *src, size_t n);
+
+extern void *memcpy_new_neon_line_size_32_auto(void *dest, const void *src, size_t n);
+
+extern void *memset_new_align_0(void *dest, int c, size_t size);
+
+extern void *memset_new_align_8(void *dest, int c, size_t size);
+
+extern void *memset_new_align_32(void *dest, int c, size_t size);
+
+extern void *memset_neon(void *dest, int c, size_t size);
+
+#endif /* FASTARM_NEW_ARM_H */
--- a/veejay-current/veejay-server/veejay/Makefile.am
+++ b/veejay-current/veejay-server/veejay/Makefile.am
@@ -66,6 +66,10 @@ if !HAVE_MJPEGTOOLS
 libveejay_la_LIBADD+=-L$(top_builddir)/thirdparty/mjpegtools -lmjpegutils
 endif

+if HAVE_ARM
+libveejay_la_LIBADD+=-L$(top_builddir)/thirdparty/fastarm -lfastarm
+endif
+
 libveejay_la_LDFLAGS +=	$(SDL_LIBS) $(SDL_TTF_LIBS) $(DIRECTFB_LIBS) $(X_LIBS) $(PTHREAD_LIBS) $(FT_LDFLAGS) $(FT_LIBS) \
 			$(XML2_LIBS) $(JPEG_LIBS) $(LIBLO_LIBS) $(LIBUNWIND_LIBS) $(GLIB_LIBS) \
 		 	$(FFMPEG_LIBS) $(XINERAMA_LIBS) $(MJPEGTOOLS_LIBS) $(LIBPNG_LIBS) \
--- a/veejay-current/veejay-server/veejay/veejay.c
+++ b/veejay-current/veejay-server/veejay/veejay.c
@@ -788,7 +788,7 @@ int main(int argc, char **argv)
 	{
 		veejay_free(info);
 		return 0;
-    }
+ 	}

 	print_license();