refactor so gcc can auto vectorize motion mapping

This commit is contained in:
c0ntrol
2016-03-19 16:40:32 +01:00
parent 443742ad3c
commit ac35bb86c3


@@ -27,17 +27,11 @@
p4 = Decay
*/
/* REVIEW / FIXME ?:
This filter detects the amount of motion in a frame. It keeps an internal
buffer to average (smoothen) the activity levels over N frames.
At each step in N, a new value is linearly interpolated which is later
pulled by other FX to override their parameter values.
To compensate for jumpy video, the frames n+1 to N are linearly interpolated
from frame n+0 to frame N automatically.
/*
* This FX relies on gcc's auto vectorization.
* To use the plain C version, define NO_AUTOVECTORIZATION
*/
#include <config.h>
#include <stdint.h>
#include <stdio.h>
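
Not part of the diff, just an editor's note on how to confirm the refactor pays off: gcc vectorizes automatically at -O3 (or with -ftree-vectorize), and -fopt-info-vec-optimized prints which loops were vectorized. The loop shape this commit moves towards is one output stream per iteration, no branches, and restrict-qualified pointers, as in this minimal standalone sketch (names are illustrative, not veejay API):

#include <stdint.h>
#include <stddef.h>

/* One load/store stream per iteration, no branches, restrict-qualified
 * pointers: the shape gcc's tree vectorizer handles without assistance.
 * Check with: gcc -O3 -c avg.c -fopt-info-vec-optimized                */
void avg_planes(uint8_t *restrict dst, const uint8_t *restrict a,
                const uint8_t *restrict b, size_t len)
{
    for (size_t i = 0; i < len; i++)
        dst[i] = (uint8_t)((a[i] + b[i]) >> 1); /* average two planes */
}

Defining NO_AUTOVECTORIZATION, as the new comment says, falls back to the plain C paths below.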
@@ -88,6 +82,7 @@ static int32_t histogram_[HIS_LEN];
static uint8_t *bg_image = NULL;
static uint8_t *binary_img = NULL;
static uint8_t *diff_img = NULL;
static uint8_t *prev_img = NULL;
static uint8_t *interpolate_buf = NULL;
@@ -98,6 +93,7 @@ static int current_decay = HIS_DEFAULT;
static uint32_t key1_ = 0, key2_ = 0, keyv_ = 0, keyp_ = 0;
static int have_bg = 0;
static int running = 0;
static int is_initialized = 0;
int motionmap_malloc(int w, int h )
{
@@ -105,7 +101,8 @@ int motionmap_malloc(int w, int h )
binary_img = (uint8_t*) vj_malloc(sizeof(uint8_t) * RUP8(w * h));
prev_img = (uint8_t*) vj_malloc(sizeof(uint8_t) * RUP8(w*h));
interpolate_buf = vj_malloc( sizeof(uint8_t) * RUP8(w*h*3));
diff_img = (uint8_t*) vj_malloc( sizeof(uint8_t) * RUP8(w*h*2));
veejay_msg(2, "This is 'Motion Mapping'");
veejay_msg(2, "This FX calculates motion energy activity levels over a period of time to scale FX parameters");
veejay_msg(2, "Add any of the following to the FX chain (if not already present)");
@@ -114,6 +111,9 @@ int motionmap_malloc(int w, int h )
veejay_memset( histogram_, 0, sizeof(int32_t) * HIS_LEN );
nframe_ = 0;
running = 0;
is_initialized ++;
return 1;
}
@@ -127,6 +127,11 @@ void motionmap_free(void)
free(binary_img);
if( prev_img )
free(prev_img);
if( diff_img )
free(diff_img);
if( is_initialized > 0 )
is_initialized --;
have_bg = 0;
interpolate_buf = NULL;
@@ -197,12 +202,25 @@ void motionmap_lerp_frame( VJFrame *cur, VJFrame *prev, int N, int n )
uint8_t *__restrict__ V0 = cur->data[2];
const uint8_t *__restrict__ V1 = prev->data[2];
for ( i = 0; i < len ; i ++ )
{
#ifndef NO_AUTOVECTORIZATION
for ( i = 0; i < len ; i ++ ) {
Y0[i] = Y1[i] + ( frac * (Y0[i] - Y1[i]));
}
for( i = 0; i < len; i ++ ) {
U0[i] = U1[i] + ( frac * (U0[i] - U1[i]));
}
for( i = 0; i < len; i ++ ) {
V0[i] = V1[i] + ( frac * (V0[i] - V1[i]));
}
#else
for ( i = 0; i < len ; i ++ ) {
Y0[i] = Y1[i] + ( frac * (Y0[i] - Y1[i]));
U0[i] = U1[i] + ( frac * (U0[i] - U1[i]));
V0[i] = V1[i] + ( frac * (V0[i] - V1[i]));
}
#endif
}
void motionmap_store_frame( VJFrame *fx )
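
An editor's aside on the change above (illustration only, not veejay code): the old motionmap_lerp_frame updated Y, U and V in one loop body, i.e. three interleaved load/store streams per iteration, while the new version gives each plane its own loop, so every loop has a single, simple dependence pattern that gcc's vectorizer and cost model handle far more readily. A hedged standalone sketch of the two shapes, assuming a plain float frac as in the code above:

#include <stdint.h>
#include <stddef.h>

/* Fused form (old shape): three planes touched per iteration. */
void lerp_fused(uint8_t *restrict y0, const uint8_t *restrict y1,
                uint8_t *restrict u0, const uint8_t *restrict u1,
                uint8_t *restrict v0, const uint8_t *restrict v1,
                float frac, size_t len)
{
    for (size_t i = 0; i < len; i++) {
        y0[i] = y1[i] + (frac * (y0[i] - y1[i]));
        u0[i] = u1[i] + (frac * (u0[i] - u1[i]));
        v0[i] = v1[i] + (frac * (v0[i] - v1[i]));
    }
}

/* Split form (new shape): one plane per loop, as in motionmap_lerp_frame. */
void lerp_split(uint8_t *restrict y0, const uint8_t *restrict y1,
                uint8_t *restrict u0, const uint8_t *restrict u1,
                uint8_t *restrict v0, const uint8_t *restrict v1,
                float frac, size_t len)
{
    for (size_t i = 0; i < len; i++)
        y0[i] = y1[i] + (frac * (y0[i] - y1[i]));
    for (size_t i = 0; i < len; i++)
        u0[i] = u1[i] + (frac * (u0[i] - u1[i]));
    for (size_t i = 0; i < len; i++)
        v0[i] = v1[i] + (frac * (v0[i] - v1[i]));
}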
@@ -243,19 +261,47 @@ static int32_t motionmap_activity_level( uint8_t *I, int width, int height )
const unsigned int len = (width * height);
int32_t level = 0;
int r,c;
for (r = 0; r < len; r += width) {
for ( c = 0; c < width; c ++ ) {
if( I[r + c] > 0 )
level ++;
level += I[r + c];
}
}
return level;
return (level>>8);
}
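
Editor's note, not part of the commit: the old body branched on every pixel to count those above zero, while the new body accumulates the mask values directly, turning the inner loop into a branch-free sum reduction that gcc vectorizes much more readily; since active pixels in the binary image are 0xff and decayed pixels fall towards 0, the final level >> 8 still approximates the active-pixel count. A minimal sketch of the two forms, with hypothetical names:

#include <stdint.h>
#include <stddef.h>

/* Old shape: a data-dependent branch per pixel. */
int32_t count_active(const uint8_t *restrict mask, size_t len)
{
    int32_t level = 0;
    for (size_t i = 0; i < len; i++)
        if (mask[i] > 0)
            level++;
    return level;
}

/* New shape: branch-free sum reduction; 0xff pixels contribute roughly 1
 * each after the final shift, decaying pixels contribute fractionally.  */
int32_t sum_active(const uint8_t *restrict mask, size_t len)
{
    int32_t level = 0;
    for (size_t i = 0; i < len; i++)
        level += mask[i];
    return level >> 8;
}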
void motionmap_find_diff( uint8_t *bg, uint8_t *prev_img, uint8_t *img, const int len, const int threshold )
void motionmap_calc_diff( const uint8_t *bg, uint8_t *prev_img, const uint8_t *img, uint8_t *pI1, uint8_t *pI2, uint8_t *bDst, const int len, const int threshold )
{
unsigned int i;
uint8_t p1,p2;
#ifndef NO_AUTOVECTORIZATION
uint8_t *I1 = __builtin_assume_aligned( pI1, 16 );
uint8_t *I2 = __builtin_assume_aligned( pI2, 16 );
for( i = 0; i < len; i ++ )
{
I1[i] = abs( bg[i] - img[i] );
if( I1[i] < threshold )
I1[i] = 0;
else
I1[i] = 0xff;
I2[i] = abs( bg[i] - prev_img[i] );
if( I2[i] < threshold )
I2[i] = 0;
else
I2[i] = 0xff;
I1[i] = abs( I1[i] - I2[i] );
I2[i] = bDst[i] >> 1;
}
for( i = 0; i < len; i ++ )
{
bDst[i] = I1[i] + I2[i];
prev_img[i] = img[i];
}
#else
for( i = 0; i < len; i ++ )
{
uint8_t q1 = 0, q2 = 0;
@@ -270,56 +316,39 @@ void motionmap_find_diff( uint8_t *bg, uint8_t *prev_img, uint8_t *img, const in
}
if( (!q1 && q2) || (!q2 && q1) ) {
binary_img[i] = 0xff;
bDst[i] = 0xff;
}
else {
binary_img[i] = (binary_img[i] >> 1); //@ decay
bDst[i] = (bDst[i] >> 1); //@ decay
}
prev_img[i] = img[i];
}
#endif
}
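
Editor's aside (illustrative, not from the commit): in the vectorized path above every comparison writes a full 0 or 0xff mask byte rather than setting a flag to branch on later, and __builtin_assume_aligned (a gcc builtin) promises the two scratch planes are 16-byte aligned so aligned vector loads and stores can be used. The per-element threshold is exactly the kind of if/else gcc if-converts into a compare-and-select; the same step spelled as a ternary, as a self-contained sketch:

#include <stdint.h>
#include <stdlib.h>
#include <stddef.h>

/* Threshold an absolute-difference plane into a 0 / 0xff mask.  Same
 * effect as the if/else pairs in motionmap_calc_diff, written as a
 * select so the compare-to-mask intent is explicit.                  */
void diff_mask(uint8_t *restrict dst, const uint8_t *restrict a,
               const uint8_t *restrict b, int threshold, size_t len)
{
    for (size_t i = 0; i < len; i++) {
        int d = abs(a[i] - b[i]);
        dst[i] = (d < threshold) ? 0 : 0xff;
    }
}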
void motionmap_find_diff_job( void *arg )
{
vj_task_arg_t *t = (vj_task_arg_t*) arg;
uint8_t *t_bg = t->input[0];
uint8_t *t_img = t->input[1];
const uint8_t *t_bg = t->input[0];
const uint8_t *t_img = t->input[1];
uint8_t *t_prev_img = t->input[2];
uint8_t *t_binary_img = t->output[0];
uint8_t *t_diff1 = t->output[1];
uint8_t *t_diff2 = t->output[2];
const int len = t->strides[0];
const int threshold = t->iparams[0];
unsigned int i;
uint8_t p1,p2;
for( i = 0; i < len; i ++ )
{
uint8_t q1 = 0, q2 = 0;
p1 = abs( t_bg[i] - t_img[i] );
if( p1 > threshold ) {
q1 = 1;
}
p2 = abs( t_bg[i] - t_prev_img[i] );
if( p2 > threshold ) {
q2 = 1;
}
if( (!q1 && q2) || (!q2 && q1) ) {
t_binary_img[i] = 0xff;
}
else {
t_binary_img[i] = (t_binary_img[i] >> 1); //@ decay in 7 frames
}
t_prev_img[i] = t_img[i];
}
motionmap_calc_diff( t_bg, t_prev_img, t_img,t_diff1,t_diff2, t_binary_img, len, threshold );
}
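
For orientation (editor's sketch, not veejay code): after this change the per-thread job is just a thin wrapper, so the threaded and single-threaded paths share one implementation of the difference algorithm. Conceptually each worker receives the same six buffers offset by its slice, something like the sequential stand-in below; the real code dispatches the slices through vj_task_run() instead of a loop.

#include <stdint.h>

/* Prototype as introduced by this commit. */
void motionmap_calc_diff( const uint8_t *bg, uint8_t *prev_img, const uint8_t *img,
                          uint8_t *pI1, uint8_t *pI2, uint8_t *bDst,
                          const int len, const int threshold );

/* Hypothetical sequential stand-in for the task slicing: every plane is
 * offset by the same amount, so each slice is an independent call.     */
void calc_diff_sliced(const uint8_t *bg, uint8_t *prev, const uint8_t *img,
                      uint8_t *d1, uint8_t *d2, uint8_t *bin,
                      int len, int threshold, int n_jobs)
{
    const int chunk = len / n_jobs;
    for (int j = 0; j < n_jobs; j++) {
        const int off = j * chunk;
        const int n   = (j == n_jobs - 1) ? len - off : chunk;
        motionmap_calc_diff(bg + off, prev + off, img + off,
                            d1 + off, d2 + off, bin + off, n, threshold);
    }
}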
int motionmap_prepare( uint8_t *map[4], int width, int height )
{
if(!is_initialized)
return 0;
vj_frame_copy1( map[0], bg_image, width * height );
motionmap_blur( bg_image, width,height );
veejay_memcpy( prev_img, bg_image, width * height );
@@ -342,35 +371,35 @@ void motionmap_apply( VJFrame *frame, int width, int height, int threshold, int
return;
}
/* run difference algorithm over multiple threads */
// run difference algorithm over multiple threads
if( vj_task_available() ) {
VJFrame task;
task.stride[0] = len; /* plane length */
task.stride[0] = len; // plane length
task.stride[1] = len;
task.stride[2] = len;
task.stride[3] = 0;
task.data[0] = bg_image; /* plane 0 = background image */
task.data[1] = frame->data[0]; /* plane 1 = luminance channel */
task.data[2] = prev_img; /* plane 2 = luminance channel of previous frame */
task.data[0] = bg_image; // plane 0 = background image
task.data[1] = frame->data[0]; // plane 1 = luminance channel
task.data[2] = prev_img; // plane 2 = luminance channel of previous frame
task.data[3] = NULL;
task.ssm = 1; /* all planes are the same size */
task.format = frame->format; /* not important, but cannot be 0 */
task.ssm = 1; // all planes are the same size
task.format = frame->format; // not important, but cannot be 0
task.shift_v = 0;
task.shift_h = 0;
task.uv_width = 0;
task.uv_height = 0;
task.width = width; /* dimensions */
task.uv_width = width;
task.uv_height = height;
task.width = width; // dimension
task.height = height;
uint8_t *dst[4] = { binary_img, binary_img, binary_img, NULL };
uint8_t *dst[4] = { binary_img, diff_img, diff_img + RUP8(len), NULL };
vj_task_set_from_frame( &task );
vj_task_set_param( threshold, 0 );
vj_task_run( task.data, dst, NULL,NULL,3, (performer_job_routine) &motionmap_find_diff_job );
}
else {
motionmap_find_diff( bg_image, prev_img, frame->data[0], len, threshold );
else {
motionmap_calc_diff( (const uint8_t*) bg_image, prev_img, (const uint8_t*) frame->data[0], diff_img, diff_img + RUP8(len), binary_img, len, threshold );
}
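
To keep the buffer wiring straight (editor's annotation, nothing here is added by the commit), the task planes map onto motionmap_calc_diff's arguments as follows; the single-threaded branch passes the same six buffers directly:

/* data[0]  = bg_image              -> bg       (background luma)
 * data[1]  = frame->data[0]        -> img      (current luma)
 * data[2]  = prev_img              -> prev_img (previous luma, updated)
 * dst[0]   = binary_img            -> bDst     (decaying motion mask)
 * dst[1]   = diff_img              -> pI1      (scratch: |bg - img| mask)
 * dst[2]   = diff_img + RUP8(len)  -> pI2      (scratch: |bg - prev| mask)
 */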
if( draw )
@@ -387,7 +416,7 @@ void motionmap_apply( VJFrame *frame, int width, int height, int threshold, int
int32_t min = INT_MAX;
current_his_len = history;
current_decay = decay;
histogram_[ (nframe_%current_his_len) ] = activity_level;
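
What follows in the function (editor's sketch based on the surrounding context, not the actual continuation): histogram_ is written as a ring buffer of the last current_his_len activity levels, indexed by nframe_ % current_his_len, and the nearby int32_t min = INT_MAX suggests the buffer is then scanned for its minimum and maximum so the current level can be normalised against recent history. Roughly:

#include <stdint.h>
#include <limits.h>

/* Scan a ring buffer of recent activity levels for its extremes so the
 * newest level can be scaled relative to recent history (hypothetical). */
void activity_range(const int32_t *hist, int his_len,
                    int32_t *min_out, int32_t *max_out)
{
    int32_t min = INT_MAX, max = INT_MIN;
    for (int i = 0; i < his_len; i++) {
        if (hist[i] < min) min = hist[i];
        if (hist[i] > max) max = hist[i];
    }
    *min_out = min;
    *max_out = max;
}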