add fastarm memcpy

2025-12-15 20:30:00 +01:00 · 2016-02-13 18:53:36 +01:00
parent 61f7de936c
commit 1eb4ff18eb
10 changed files with 2172 additions and 160 deletions
--- a/veejay-current/veejay-server/configure.ac
+++ b/veejay-current/veejay-server/configure.ac
@@ -376,6 +376,27 @@ esac
 CFLAGS="$CFLAGS -D_FILE_OFFSET_BITS=64 -D_LARGE_FILES"
 dnl Determine ARCHFLAGS, the sub-architecture compiler flags.
 dnl Cross compiling (host_alias set): derive -march/-mcpu from host_cpu on x86 only.
 dnl Native build: when arch_target is auto, run cpuinfo.sh and read veejay.arch
 dnl (presumably written by that script), otherwise fall back to -mtune=generic.
 if test x$host_alias != x; then
 	dnl Cross compiling
 	AC_MSG_CHECKING(sub-architecture settings)
 	if test x$have_x86cpu = xtrue; then
 		host_mod_cpu=`echo $host_cpu|tr _ -`
 		ARCHFLAGS="-march=$host_mod_cpu -mcpu=$host_mod_cpu"
 	fi
 	dnl Report a result on every path so the checking line is always terminated
 	AC_MSG_RESULT($ARCHFLAGS)
 else
 	AC_MSG_CHECKING(sub-architecture settings)
 	chmod +x $srcdir/cpuinfo.sh
 	if test "$arch_target" = "auto"; then
 		TMP=`$srcdir/cpuinfo.sh`
 		ARCHFLAGS=`cat veejay.arch`
 	else
 		ARCHFLAGS="-mtune=generic"
 	fi
 	AC_MSG_RESULT($ARCHFLAGS)
 fi
 dnl ARM architecture detect NEON and set CFLAGS
 if test x$have_arm = xtrue
@@ -392,8 +413,11 @@ then
 	if test $ac_cv_flag_neon = yes ; then
 		AC_DEFINE(HAVE_ARM_NEON,1,[Compiling in NEON support])
 		USER_CFLAGS="-mfpu=neon -ftree-vectorize -mvectorize-with-neon-quad $USER_CFLAGS"
 		FASTARM_CFLAGS="$ARCHFLAGS -Wa,-march=armv7-a -mthumb -Wa,-mthumb -Wa,-mimplicit-it=always -mthumb-interwork -DCONFIG_THUMB"
 	else
 		USER_CFLAGS="-march=native -ftree-vectorize $USER_CFLAGS"
 		FASTARM_CFLAGS="$ARCHFLAGS -Wa, -mthumb -Wa,-mthumb -Wa,-mimplicit-it=always -mthumb-interwork -DCONFIG_THUMB"
 	fi
 	if test "x$enable_debug" != "xyes" ; then
@@ -406,6 +430,8 @@ then
 	SUBSAMPLE_CFLAGS="$USER_CFLAGS"
 	VJE_CFLAGS="$USER_CFLAGS"
 	CFLAGS="$USER_CFLAGS -D_FILE_OFFSET_BITS=64 -D_LARGE_FILES"
 	AC_SUBST(FASTARM_CFLAGS)
 fi
 dnl This flag is used for PROGRAMS not SHARED LIBRARIES.  PIC code is required
@@ -584,28 +610,6 @@ EOF
    fi
 fi
 if test x$host_alias != x; then
 	dnl Cross compiling
 	AC_MSG_CHECKING(sub-architecture settings)
 	if test x$have_x86cpu = xtrue; then
 		host_mod_cpu=`echo $host_cpu|tr _ -`
 		ARCHFLAGS="-march=$host_mod_cpu -mcpu=$host_mod_cpu"
 		AC_MSG_RESULT($ARCHFLAGS)
 	fi
 else
 	AC_MSG_CHECKING(sub-architecture settings)
 	chmod +x $srcdir/cpuinfo.sh
 	if test "$arch_target" = "auto"; then
 		TMP=`$srcdir/cpuinfo.sh`
 		ARCHFLAGS=`cat veejay.arch`
 	else
       		ARCHFLAGS="-mtune=generic"
 	fi
 		AC_MSG_RESULT($ARCHFLAGS)
 fi
 have_mjpegtools=false
 AC_SUBST(MJPEGTOOLS_CFLAGS)
 AC_SUBST(MJPGETOOLS_LIBS)
@@ -1074,6 +1078,7 @@ AM_CONDITIONAL(HAVE_JPEG,test x$have_jpeg = xtrue)
 AM_CONDITIONAL(HAVE_LIBLO,test x$have_liblo = xtrue)
 AM_CONDITIONAL(HAVE_FREETYPE2, test x$have_freetype2 = xtrue)
 AM_CONDITIONAL(HAVE_MJPEGTOOLS, test x$have_mjpegtools = xtrue )
 AM_CONDITIONAL(HAVE_ARM, test x$have_arm = xtrue )
 dnl *********************************************************************
 dnl Check for what warnings we want gcc to use and adjust the CFLAGS
 dnl as needed. This only works for GCC.
@@ -1161,6 +1166,7 @@ fi
 AC_CONFIG_FILES([
 thirdparty/Makefile
 thirdparty/fastarm/Makefile
 thirdparty/aclib/Makefile
 thirdparty/bio2jack/Makefile
 thirdparty/libhash/Makefile
--- a/veejay-current/veejay-server/libvjmem/memcpy.c
+++ b/veejay-current/veejay-server/libvjmem/memcpy.c
@@ -141,6 +141,10 @@
 #include <libvje/vje.h>
 #include <veejay/vj-task.h>
 #include <libavutil/cpu.h>
 #ifdef HAVE_ARM
 #include <fastarm/new_arm.h>
 #endif
 #define BUFSIZE 1024
@@ -157,37 +161,12 @@
 static int selected_best_memcpy = 1;
 static int selected_best_memset = 1;
-#ifdef HAVE_POSIX_TIMERS
+static double get_time()
 static int64_t _x_gettime(void)
 {
-	struct timespec tm;
+	struct timespec ts;
-	return (clock_gettime(CLOCK_THREAD_CPUTIME_ID,&tm) == -1 )
+	clock_gettime( CLOCK_REALTIME, &ts );
-			? times(NULL)
+	return (double) ts.tv_sec + (double) ts.tv_nsec / 1000000000.0;
 			: (int64_t) tm.tv_sec * 1e9 + tm.tv_nsec;
 }
 #define rdtsc(x) _x_gettime()
 #elif (defined(ARCH_X86) || defined(ARCH_X86_64)) && defined(HAVE_SYS_TIMES_H)
 static int64_t rdtsc(int cpu_flags)
 {
 	int64_t x;
 	if( cpu_flags & AV_CPU_FLAGS_MMX ) {
 			__asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
 			return x;
 	} else {
 			return times(NULL);
 	}
 }
 #else
 static uint64_t rdtsc(int cpu_flags)
 {
 #ifdef HAVE_SYS_TIMES_H
 		struct tms tp;
 		return times(&tp);
 #else
 		return clock();
 #endif
 }
 #endif /* HAVE_SYS_TIMES_H */
 #if defined(ARCH_X86) || defined (ARCH_X86_64)
 /* for small memory blocks (<256 bytes) this version is faster */
@@ -251,7 +230,6 @@ void	yuyv_plane_clear( size_t len, void *to )
 	if( vj_task_available() ) {
 		uint8_t * t    = (uint8_t*) to;
 		uint8_t *in[4] = { t, NULL,NULL,NULL };
 		int 	strides[4] = { len, 0,0,0 };
 		vj_task_run( in, in, NULL, NULL, 1, (performer_job_routine) &yuyv_plane_clear_job );
 	}
 	else {
@@ -1349,10 +1327,10 @@ static void *memcpy_neon( void *to, const void *from, size_t n )
 static struct {
-     char                 *name;
+     char	*name;
-     void               *(*function)(void *to, const void *from, size_t len);
+     void	*(*function)(void *to, const void *from, size_t len);
-     uint64_t	   time;
+     double	t;
-     uint32_t		cpu_require;
+     uint32_t cpu_require;
 } memcpy_method[] =
 {
     { NULL, NULL, 0},
@@ -1382,23 +1360,44 @@ static struct {
 #endif  
 #ifdef HAVE_ARM_NEON
 	{ "NEON optimized memcpy()", (void*) memcpy_neon, 0, AV_CPU_FLAG_NEON },
 #endif
 #ifdef HAVE_ARM
 	{ "new mempcy for cortex with line size of 32, preload offset of 192 (C) Harm Hanemaaijer <fgenfb@yahoo.com>", (void*) memcpy_new_line_size_32_preload_192,0,0 },
 	{ "new memcpy for cortex with line size of 64, preload offset of 192 (C) Harm Hanemaaijer <fgenfb@yahoo.com>" ,(void*) memcpy_new_line_size_64_preload_192, 0, 0 },
    { "new memcpy for cortex with line size of 64, preload offset of 192, aligned access (C) Harm Hanemaaijer <fgenfb@yahoo.com>", (void*) memcpy_new_line_size_64_preload_192_aligned_access, 0, 0 },
 	{ "new memcpy for cortex with line size of 32, preload offset of 192, align 32", (void*) memcpy_new_line_size_32_preload_192_align_32,0,0},
 	{ "new memcpy for cortex with line size of 32, preload offset of 96", (void*) memcpy_new_line_size_32_preload_96,0,0},
 	{ "new memcpy for cortex with line size of 32, preload offset of 96, aligned access", (void*) memcpy_new_line_size_32_preload_96_aligned_access,0,0},
 #endif
 #ifdef HAVE_ARM_NEON
 	{ "new memcpy for cortex using NEON with line size of 32, preload offset of 192 (C) Harm Hanemaaijer <fgenfb@yahoo.com>", (void*) memcpy_new_neon_line_size_32,0,AV_CPU_FLAG_NEON},
 	{ "new memcpy for cortex using NEON with line size of 64, preload offset of 192 (C) Harm Hanemaaijer <fgenfb@yahoo.com>", (void*) memcpy_new_neon_line_size_64,0,AV_CPU_FLAG_NEON},
 	{ "new mempcy for cortex using NEON with line size of 32, automatic prefetcher (C) Harm Hanemaaijer <fgenfb@yayhoo.com>", (void*) memcpy_new_neon_line_size_32_auto,0,AV_CPU_FLAG_NEON},
 #endif
     { NULL, NULL, 0},
 };
 /*
  * Table of candidate memset implementations.
  *
  * Entry 0 is a NULL placeholder; users of the table iterate from index 1
  * until they reach an entry whose name is NULL.
  *
  *   name         description printed in the benchmark report
  *   function     the memset implementation
  *   t            measured benchmark time, filled in by find_best_memset()
  *   cpu_require  AV_CPU_FLAG_* bits required to run the entry, 0 for none
  *
  * The field order (t before cpu_require) matches the positional
  * initializers below, which are written as { name, function, 0, flags },
  * and matches the layout of memcpy_method.
  */
 static struct {
-     char                 *name;
+	char            *name;
-     void                *(*function)(void *to, uint8_t c, size_t len);
+	void            *(*function)(void *to, uint8_t c, size_t len);
-	 uint64_t	    time;
+	double			t;
-     uint32_t		cpu_require;
+	uint32_t		cpu_require;
 } memset_method[] =
 {
-     { NULL, NULL, 0,0},
+	{ NULL, NULL, 0,0},
-     { "glibc memset()",            (void*)memset, 0,0},
+	{ "glibc memset()",(void*)memset,0,0},
 #if defined(HAVE_ASM_MMX) || defined(HAVE_ASM_MMX2) || defined(HAVE_ASM_SSE)
-     { "MMX/MMX2/SSE optimized memset()", (void*)   fast_memset, 0, AV_CPU_FLAG_MMX|AV_CPU_FLAG_SSE|AV_CPU_FLAG_MMX2},
+	{ "MMX/MMX2/SSE optimized memset()", (void*) fast_memset,0,AV_CPU_FLAG_MMX|AV_CPU_FLAG_SSE|AV_CPU_FLAG_MMX2 },
 #endif 
-       { NULL, NULL, 0,0},
+#ifdef HAVE_ARM_NEON
 	{ "memset_neon (C) Harm Hanemaaijer <fgenfb@yahoo.com>", (void*) memset_neon,0, AV_CPU_FLAG_NEON },
 #endif
 #ifdef HAVE_ARM
 	{ "memset align 0 (C) Harm Hanemaaijer <fgenfb@yahoo.com>", (void*) memset_new_align_0,0,0 },
 	{ "memset align 8 (C) Harm Hanemaaijer <fgenfb@yahoo.com>", (void*) memset_new_align_8,0,0 },
 	{ "memset align 32 (C) Harm Hanemaaijer <fgenfb@yahoo.com>", (void*) memset_new_align_32,0,0 },
 #endif
 	{ NULL, NULL, 0, 0},
 };
@@ -1407,10 +1406,10 @@ void	memcpy_report()
 	int i;
 	fprintf(stdout,"SIMD benchmark results:\n");
 	for( i = 1; memset_method[i].name; i ++ ) {
-		fprintf(stdout,"\t%8ld : %s\n",(long) memset_method[i].time,  memset_method[i].name );
+		fprintf(stdout,"\t%g : %s\n",memset_method[i].t,  memset_method[i].name );
 	}
 	for( i = 1; memcpy_method[i].name; i ++ ) {
-		fprintf(stdout,"\t%8ld : %s\n",(long) memcpy_method[i].time,  memcpy_method[i].name );
+		fprintf(stdout,"\t%g : %s\n",memcpy_method[i].t,  memcpy_method[i].name );
 	}
 }
@@ -1430,7 +1429,7 @@ char *get_memset_descr()
 void find_best_memcpy()
 {
-     uint64_t t;
+     double t;
     char *buf1, *buf2;
     int i, best = 0,k;
     int bufsize = 720 * 576 * 3;
@@ -1445,6 +1444,8 @@ void find_best_memcpy()
     int cpu_flags = av_get_cpu_flags();
 	veejay_msg(VEEJAY_MSG_DEBUG, "Finding best memcpy ..." );	
     memset(buf1,0, bufsize);
     memset(buf2,0, bufsize);
@@ -1454,28 +1455,28 @@ void find_best_memcpy()
 	for( i = 1; memcpy_method[i].name; i ++ ) {
-		t = rdtsc(cpu_flags);
+		t = get_time();
 		if( memcpy_method[i].cpu_require && !(cpu_flags & memcpy_method[i].cpu_require ) ) {
-			memcpy_method[i].time = 0;
+			memcpy_method[i].t = 0.0;
 			continue;
 		}
 		for( k = 0; k < 128; k ++ ) {
 			memcpy_method[i].function( buf1,buf2, bufsize );
 		}
-		t = rdtsc(cpu_flags) - t;
+		t = get_time() - t;
-		memcpy_method[i].time = t;
+		memcpy_method[i].t = t;
 	}
 	for( i = 1; memcpy_method[i].name; i ++ ) {
 		if(best == 0 ) { 
 			best = i;
-		    t = memcpy_method[i].time;	
+		    t = memcpy_method[i].t;	
 			continue;
 		}
-		if( memcpy_method[i].time < t && memcpy_method[i].time > 0 ) {
+		if( memcpy_method[i].t < t && memcpy_method[i].t > 0 ) {
-			t = memcpy_method[i].time;
+			t = memcpy_method[i].t;
 			best = i;
 		}
 	}
@@ -1494,53 +1495,55 @@ void find_best_memcpy()
 /*
  * find_best_memset: benchmark the entries of memset_method[] and install
  * the fastest one as veejay_memset.
  *
  * Entries whose cpu_require bits are not all absent from av_get_cpu_flags()
  * are timed with get_time() over 128 fills of a 720*576*3 byte buffer and
  * the elapsed time is stored in the entry; entries that fail the check are
  * skipped. If no entry was selected, entry 1 is installed. The chosen index
  * is stored in selected_best_memset. The function returns without doing
  * anything if a scratch buffer cannot be allocated.
  *
  * NOTE(review): this hunk shows the buf2 allocation twice, once before and
  * once after the debug message. If both copies survive in the resulting
  * file, the first buf2 allocation is leaked - confirm against the applied
  * source.
  */
 void find_best_memset()
 {
-    uint64_t t;
+	double t;
-    char *buf1, *buf2;
+	char *buf1, *buf2;
-    int i, best = 0,k;
+	int i, best = 0,k;
 	int bufsize = 720 * 576 * 3;
 	int cpu_flags = av_get_cpu_flags();
-     if (!(buf1 = (char*) malloc( bufsize * sizeof(char) )))
+	if (!(buf1 = (char*) malloc( bufsize * sizeof(char) )))
-          return;
+        	return;
 	if (!(buf2 = (char*) malloc( bufsize * sizeof(char) ))) {
 		free( buf1 );
 		return;
 	}
 	veejay_msg(VEEJAY_MSG_DEBUG, "Finding best memset..." );
     if (!(buf2 = (char*) malloc( bufsize * sizeof(char) ))) {
          free( buf1 );
          return;
     }
 	memset( buf1, 0, bufsize * sizeof(char));
 	memset( buf2, 0, bufsize * sizeof(char));
-     for (i=1; memset_method[i].name; i++)
+	for (i=1; memset_method[i].name; i++)
-     {
+	{
 		if( memset_method[i].cpu_require && !(cpu_flags & memset_method[i].cpu_require ) ) {
-			memset_method[i].time= 0;
+			memset_method[i].t= 0;
 			continue;
-		}
+	}
-        t = rdtsc(cpu_flags);
+	t = get_time();
-		for( k = 0; k < 128; k ++ ) {
+	for( k = 0; k < 128; k ++ ) {
-			memset_method[i].function( buf1 , 0 , bufsize );
+		memset_method[i].function( buf1 , 0 , bufsize );
-		}
+	}
-        t = rdtsc(cpu_flags) - t;
+	t = get_time() - t;
-	  
+ 
-        memset_method[i].time = t;
+	memset_method[i].t = t;
-        if (best == 0 || t < memset_method[best].time)
+	if (best == 0 || t < memset_method[best].t)
-        	best = i;
+		best = i;
-     }
+	}	
-     if (best) {
+	if (best) {
-          veejay_memset = memset_method[best].function;
+		veejay_memset = memset_method[best].function;
-     } else {
+	} 
-		  veejay_memset = memset_method[1].function;
+	else {
-	 }
+	  veejay_memset = memset_method[1].function;
 	}
 	selected_best_memset = best;
-	 free( buf1 );
+	free( buf1 );
-     free( buf2 );
+	free( buf2 );
 }
 static	void	vj_frame_copy_job( void *arg ) {
@@ -1721,122 +1724,118 @@ void	vj_frame_clear1( uint8_t *input, unsigned int val, int size )
 	vj_frame_clear( in, strides, val );
 }
-static unsigned long benchmark_single_slow(long c, int n_tasks, uint8_t **source, uint8_t **dest, int *planes)
+static double benchmark_single_slow(long c, int n_tasks, uint8_t **source, uint8_t **dest, int *planes)
 {
-	uint64_t k;
+	int k;
-	uint64_t stats[c];
+	double stats[c];
 	uint64_t bytes = ( planes[0] + planes[1] + planes[2] + planes[3] );
 	for( k = 0; k < c; k ++ )	
 	{
-		uint64_t t = rdtsc(0);
+		double t = get_time();
 		vj_frame_slow_single( source, source, dest, planes[0], planes[1]/2, 0.67f );
-		t = rdtsc(0) - t;
+		t = get_time() - t;
 		stats[k] = t;
 	}
-	uint64_t sum = 0;
+	double sum = 0.0;
 	for( k = 0; k < c ;k ++ )
 		sum += stats[k];
-	uint64_t best_time = (sum / c );
+	double best_time = (sum / c );
-	veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %2.2f ms",
+	veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %g",(float)((bytes*c) /1048576.0f), best_time);
 			(float)(bytes /1048576.0f), (best_time/1000.0f));
 	return best_time;
 }
-static unsigned long benchmark_threaded_slow(long c, int n_tasks, uint8_t **source, uint8_t **dest, int *planes)
+static double benchmark_threaded_slow(long c, int n_tasks, uint8_t **source, uint8_t **dest, int *planes)
 {
-	uint64_t k;
+	int k;
-	uint64_t stats[c];
+	double stats[c];
 	uint64_t bytes = ( planes[0] + planes[1] + planes[2] + planes[3] );
 	for( k = 0; k < c; k ++ )	
 	{
-		uint64_t t = rdtsc(0);
+		uint64_t t = get_time();
 		vj_frame_slow_threaded( source, source, dest, planes[0], planes[1]/2, 0.67f );
-		t = rdtsc(0) - t;
+		t = get_time() - t;
 		stats[k] = t;
 	}
-	uint64_t sum = 0;
+	double sum = 0.0;
 	for( k = 0; k < c ;k ++ )
 		sum += stats[k];
-	uint64_t best_time = (sum / c );
+	double best_time = (sum / c );
-	veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %2.2f ms",
+	veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %g",(float)((bytes*c) /1048576.0f), best_time);
 			(float)(bytes /1048576.0f), (best_time/1000.0f));
 	return best_time;
 }
-static unsigned long benchmark_threaded_copy(long c, int n_tasks, uint8_t **dest, uint8_t **source, int *planes)
+static double benchmark_threaded_copy(long c, int n_tasks, uint8_t **dest, uint8_t **source, int *planes)
 {
-	uint64_t k;
+	int k;
-	uint64_t stats[c];
+	double stats[c];
 	uint64_t bytes = ( planes[0] + planes[1] + planes[2] + planes[3] );
 	for( k = 0; k < c; k ++ )	
 	{
-		uint64_t t = rdtsc(0);
+		double t = get_time();
 		vj_frame_copyN( source,dest,planes );
-		t = rdtsc(0) - t;
+		t = get_time() - t;
 		stats[k] = t;
 	}
-	uint64_t sum = 0;
+	double sum = 0.0;
 	for( k = 0; k < c ;k ++ )
 		sum += stats[k];
-	uint64_t best_time = (sum / c );
+	double best_time = (sum / c );
-	veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %2.2f ms",
+	veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %g",(float)((bytes*c) /1048576.0f), best_time);
 			(float)(bytes /1048576.0f), (best_time/1000.0f));
 	return best_time;
 }
-static unsigned long benchmark_single_copy(long c,int dummy, uint8_t **dest, uint8_t **source, int *planes)
+static double benchmark_single_copy(long c,int dummy, uint8_t **dest, uint8_t **source, int *planes)
 {
-	uint64_t k; int j;
+	int k; int j;
-	uint64_t stats[c];
+	double stats[c];
 	uint64_t bytes = ( planes[0] + planes[1] + planes[2] + planes[3] );
 	for( k = 0; k < c; k ++ ) {
-		uint64_t t = rdtsc(0);
+		double t = get_time();
 		for( j = 0; j < 4; j ++ ) {
 			veejay_memcpy( dest[j], source[j], planes[j] );
 		}
-		t = rdtsc(0) - t;
+		t = get_time() - t;
 		stats[k] = t;
 	}
-	uint64_t sum = 0;
+	double sum = 0.0;
 	for( k = 0; k < c; k ++ ) 
 		sum += stats[k];
-	uint64_t best_time = (sum/c);
+	double best_time = (sum/c);
-	veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %2.2f ms",
+	veejay_msg(VEEJAY_MSG_DEBUG, "%.2f MB data in %g",(float)((bytes*c) /1048576.0f), best_time);
 			(float)(bytes /1048576.0f), (best_time/1000.0f));
 	return best_time;
 }
 /* Signature of a benchmark routine; run_benchmark_test() receives one as its f argument. */
-typedef unsigned long (*benchmark_func)(long c, int dummy, uint8_t **dest, uint8_t **source, int *planes);
+typedef double (*benchmark_func)(long c, int dummy, uint8_t **dest, uint8_t **source, int *planes);
 void run_benchmark_test(int n_tasks, benchmark_func f, char *str, int n_frames, uint8_t **dest, uint8_t **source, int *planes )
 {
-	uint32_t N = 8;
+	int N = 8;
-	uint64_t stats[N];	
+	double stats[N];	
 	uint32_t i;
-	uint64_t fastest = 0;
+	double fastest = 0.0;
 	float work_size = (planes[0] + planes[1] + planes[2] + planes[3]) / 1048576.0f;
 	veejay_msg(VEEJAY_MSG_INFO, "run test '%s' (%dx) on chunks of %2.2f MB:", str, N, work_size );
@@ -1848,8 +1847,8 @@ void run_benchmark_test(int n_tasks, benchmark_func f, char *str, int n_frames,
 			fastest = stats[i];
 	}
-	uint64_t sum = 0;
+	double sum = 0.0;
-	uint64_t slowest=fastest;
+	double slowest=fastest;
 	for( i = 0; i < N; i ++ )
 	{
 		if( stats[i] < fastest ) {
@@ -1860,8 +1859,7 @@ void run_benchmark_test(int n_tasks, benchmark_func f, char *str, int n_frames,
 	float average = (sum / N);
-	veejay_msg(VEEJAY_MSG_INFO, "run done: best score for %s is %2.4f ms, worst is %2.4f ms, average is %2.4f ms", 
+	veejay_msg(VEEJAY_MSG_INFO, "run done: best score for %s is %g, worst is %g, average is %g",str, fastest, slowest, average );
 		str, fastest/1000.0f, slowest/1000.0f, average/1000.0f );
 }
 void benchmark_tasks(int n_tasks, long n_frames, int w, int h)
@@ -1914,16 +1912,19 @@ void	benchmark_veejay(int w, int h)
 	if( h < 64)
 		h = 64;
 	veejay_msg(VEEJAY_MSG_INFO, "Starting benchmark %dx%d YUVP 4:2:2 (100 frames)", w,h);
 	int n_tasks = task_num_cpus();
 	init_parallel_tasks( n_tasks );
 	char *str2 = getenv( "VEEJAY_MULTITHREAD_TASKS" );
 	if( str2 ) {
 		n_tasks = atoi(str2);
 	}
-	
+
-	int n_frames = 100;
+	veejay_msg(VEEJAY_MSG_INFO, "VEEJAY_MULTITHREAD_TASKS=%d", n_tasks );
-	veejay_msg(VEEJAY_MSG_INFO, "Benchmark %dx%d YUVP 4:2:2 (%d frames)", w,h,n_frames);
+
-	benchmark_tasks( n_tasks, n_frames,w,h );
+	init_parallel_tasks( n_tasks );
 	benchmark_tasks( n_tasks,100,w,h );
 }
 void	*vj_hmalloc(size_t sze, const char *name)
--- a/veejay-current/veejay-server/libvjmem/vj-x86.c
+++ b/veejay-current/veejay-server/libvjmem/vj-x86.c
@@ -183,10 +183,6 @@ int	vj_mem_threaded_init(int w, int h)
 			num_tasks = n_cpus;
 			if( num_tasks < 1 )
 				num_tasks = 1;
 			if( num_tasks > 1 )
 				veejay_msg( VEEJAY_MSG_INFO, "Using %d threads scheduled over %d cpus in performer.", num_tasks, n_cpus-1 );
 		}
 	}
--- a/veejay-current/veejay-server/thirdparty/Makefile.am
+++ b/veejay-current/veejay-server/thirdparty/Makefile.am
@@ -3,3 +3,8 @@ SUBDIRS = aclib bio2jack libhash liblzo libOSC libresample
 if !HAVE_MJPEGTOOLS
 SUBDIRS += mjpegtools 
 endif
 # Descend into the bundled fastarm directory only when the HAVE_ARM conditional is set.
 if HAVE_ARM
 SUBDIRS += fastarm
 endif
--- a/veejay-current/veejay-server/thirdparty/fastarm/Makefile.am
+++ b/veejay-current/veejay-server/thirdparty/fastarm/Makefile.am
@@ -0,0 +1,10 @@
 # Makefile for veejay: builds libfastarm.la from new_arm.S as a
 # non-installed (noinst) libtool library.
 MAINTAINERCLEANFILES = Makefile.in
 AM_CFLAGS = $(FASTARM_CFLAGS)
 # NOTE(review): FASTARM_CFLAGS is repeated in AM_CPPFLAGS below, presumably so
 # the flags also reach the preprocessed assembler source (.S) - confirm.
 AM_CPPFLAGS = -I$(top_srcdir) -I$(includedir) \
              -I$(top_srcdir)/thirdparty $(FASTARM_CFLAGS) 
 FASTARM_LIB_FILE = libfastarm.la
 noinst_LTLIBRARIES = $(FASTARM_LIB_FILE)
 libfastarm_la_SOURCES = new_arm.S
 EXTRA_DIST= 
--- a/veejay-current/veejay-server/thirdparty/fastarm/README
+++ b/veejay-current/veejay-server/thirdparty/fastarm/README
@@ -0,0 +1,97 @@
 fastarm
 This toolkit contains a set of fast memcpy/memset variants for ARM
 platforms. They use either the standard register file or, optionally,
 NEON instructions.
 Several basic families of variants are provided. The current ones are
 the "new memcpy" variants, which are the default for memcpy replacement.
 They generally do not overfetch beyond the source region and can be
 configured either to use unaligned memory access for small sizes or to
 use strictly aligned memory access. This family can also be configured to
 include a fast path for smaller sizes (this is the default); disabling
 it results in smaller code size at the expense of worse performance
 for small sizes. NEON-optimized versions, which are generally faster
 with reduced code size, are also provided.
 To compile the benchmark program, run 'make'. This will compile in a
 plethora of variants with different preload strategies, block sizes,
 alignment etc.
 A benchmark program to compare various memcpy variants is provided. Try
 something like "./benchmark --memcpy ad --all". (Use --memcpy al on the
 Raspberry Pi platform).
 To compile a memcpy replacement library, set PLATFORM to one of the
 values described at the beginning of the Makefile. This selects the
 cache line size to use and whether to use NEON versions.
 Optionally disable Thumb2 mode compilation by commenting out the THUMBFLAGS
 definition. It must be disabled on the Raspberry Pi.
 Then run:
    sudo make install_memcpy_replacement
 The replacement memcpy/memset shared library will be installed into
 /usr/lib/arm-linux-gnueabihf/ as libfastarm.so.
 To enable the use of the replacement memcpy in applications, create or edit
 the file /etc/ld.so.preload so that it contains the line:
    /usr/lib/arm-linux-gnueabihf/libfastarm.so
 On the RPi platform, references to libcofi_rpi.so should be commented out
 or deleted. The new memcpy should now be activated for newly launched
 programs. To be sure, reboot or run:
    sudo ldconfig
 To revert to the default optimized memcpy on the RPi platform,
 edit /etc/ld.so.preload so that it contains the line:
    /usr/lib/arm-linux-gnueabihf/libcofi_rpi.so
 instead of the one using libfastarm.so.
 Note on cache line size:
 Although assuming a preload line size of 64 bytes is a little faster on
 several Cortex platforms for small to moderate sizes, assuming 32-byte
 preloads seems to be faster when accessing DRAM with larger sizes. On earlier
 Cortex A9 models, 32-byte preloads are required for good performance in all
 cases.
 Notes on performance with and without NEON:
 For NEON-based memcpy, a significant benefit is seen on the tested Cortex A8
 platform for unaligned copies in cache memory and for aligned and unaligned
 copies in DRAM. Performance for aligned copies in cache memory is relatively
 similar to the optimized non-NEON function.
 Results in MB/s on a Cortex A8, with Thumb2 mode enabled, of
 standard libc (Debian unstable), armv7 and NEON optimized memcpy
 variants with line size of 32 bytes:
 		libc	armv7	NEON
 test 0		522	549	567
 test 1		329	377	378
 test 2		434	430	513
 test 28		351	361	458
 test 29		246	248	358
 test 43		467	512	581
 Test 0 in the benchmark program tests word-aligned requests with
 sizes that are a power of 2 up to 4096 bytes distributed according
 to a power law.
 Test 1 in the benchmark program tests word-aligned requests with
 sizes up to 1024 that are a multiple of 4, distributed according
 to a power law.
 Test 2 in the benchmark program tests unaligned requests with sizes
 up to 1023 bytes.
 Test 28 in the benchmark program tests word aligned requests in DRAM
 with sizes up to 1024 bytes.
 Test 29 in the benchmark program tests word aligned requests in DRAM
 with sizes up to 256 bytes.
 Test 43 in the benchmark program tests page aligned requests in DRAM
 of size 4096 (copying a memory page).
--- a/veejay-current/veejay-server/thirdparty/fastarm/new_arm.S
+++ b/veejay-current/veejay-server/thirdparty/fastarm/new_arm.S
--- a/veejay-current/veejay-server/thirdparty/fastarm/new_arm.h
+++ b/veejay-current/veejay-server/thirdparty/fastarm/new_arm.h
@@ -0,0 +1,35 @@
 /*
  * Prototypes for the fastarm memcpy and memset variants (presumably
  * implemented in new_arm.S - confirm).
  *
  * The memcpy_* functions take (dest, src, n) and the memset_* functions
  * take (dest, c, size); all of them return a void pointer.
  *
  * The header is self-contained: it pulls in <stddef.h> for size_t and is
  * protected against multiple inclusion.
  */
 #ifndef FASTARM_NEW_ARM_H
 #define FASTARM_NEW_ARM_H

 #include <stddef.h> /* size_t */

 /* memcpy variants using the standard register file */
 extern void *memcpy_new_line_size_64_preload_192(void *dest,
    const void *src, size_t n);
 extern void *memcpy_new_line_size_64_preload_192_align_32(void *dest,
    const void *src, size_t n);
 extern void *memcpy_new_line_size_64_preload_192_aligned_access(void *dest,
    const void *src, size_t n);
 extern void *memcpy_new_line_size_32_preload_192(void *dest,
    const void *src, size_t n);
 extern void *memcpy_new_line_size_32_preload_192_align_32(void *dest,
    const void *src, size_t n);
 extern void *memcpy_new_line_size_32_preload_96(void *dest,
    const void *src, size_t n);
 extern void *memcpy_new_line_size_32_preload_96_aligned_access(void *dest,
    const void *src, size_t n);
 /* NEON memcpy variants */
 extern void *memcpy_new_neon_line_size_64(void *dest, const void *src, size_t n);
 extern void *memcpy_new_neon_line_size_32(void *dest, const void *src, size_t n);
 extern void *memcpy_new_neon_line_size_32_auto(void *dest, const void *src, size_t n);
 /* memset variants */
 extern void *memset_new_align_0(void *dest, int c, size_t size);
 extern void *memset_new_align_8(void *dest, int c, size_t size);
 extern void *memset_new_align_32(void *dest, int c, size_t size);
 extern void *memset_neon(void *dest, int c, size_t size);

 #endif /* FASTARM_NEW_ARM_H */
--- a/veejay-current/veejay-server/veejay/Makefile.am
+++ b/veejay-current/veejay-server/veejay/Makefile.am
@@ -66,6 +66,10 @@ if !HAVE_MJPEGTOOLS
 libveejay_la_LIBADD+=-L$(top_builddir)/thirdparty/mjpegtools -lmjpegutils
 endif
 # When the HAVE_ARM conditional is set, link libveejay against libfastarm from thirdparty/fastarm.
 if HAVE_ARM
 libveejay_la_LIBADD+=-L$(top_builddir)/thirdparty/fastarm -lfastarm
 endif
 libveejay_la_LDFLAGS +=	$(SDL_LIBS) $(SDL_TTF_LIBS) $(DIRECTFB_LIBS) $(X_LIBS) $(PTHREAD_LIBS) $(FT_LDFLAGS) $(FT_LIBS) \
 			$(XML2_LIBS) $(JPEG_LIBS) $(LIBLO_LIBS) $(LIBUNWIND_LIBS) $(GLIB_LIBS) \
 		 	$(FFMPEG_LIBS) $(XINERAMA_LIBS) $(MJPEGTOOLS_LIBS) $(LIBPNG_LIBS) \
--- a/veejay-current/veejay-server/veejay/veejay.c
+++ b/veejay-current/veejay-server/veejay/veejay.c
@@ -788,7 +788,7 @@ int main(int argc, char **argv)
 	{
 		veejay_free(info);
 		return 0;
-    }
+ 	}
 	print_license();