Auto-vectorization wins: disable the hand-written MMX paths in the negation effects

This commit is contained in:
niels
2015-06-02 21:21:13 +02:00
parent bc95c72c3f
commit 00c84f21f1
2 changed files with 29 additions and 141 deletions

View File

@@ -23,6 +23,9 @@
#include <libvjmem/vjmem.h>
#include "negatechannel.h"
#include "common.h"
#undef HAVE_ASM_MMX
vj_effect *negatechannel_init(int w, int h)
{
vj_effect *ve = (vj_effect *) vj_calloc(sizeof(vj_effect));
@@ -56,92 +59,21 @@ void negatechannel_apply( VJFrame *frame, int width, int height, int chan, int v
uint8_t *Cb = frame->data[1];
uint8_t *Cr = frame->data[2];
#ifndef HAVE_ASM_MMX
switch( chan ) {
case 0:
for (i = 0; i < len; i++) {
*(Y) = val - *(Y);
*(Y)++;
Y[i] = val - Y[i];
}
break;
case 1:
for (i = 0; i < uv_len; i++) {
*(Cb) = val - *(Cb);
*(Cb)++;
Cb[i] = val - Cb[i];
}
break;
case 2:
for (i = 0; i < uv_len; i++) {
*(Cr) = val - *(Cr);
*(Cr)++;
Cr[i] = val - Cr[i];
}
break;
}
#else
int left = len % 8;
int work= len >> 3;
vje_load_mask(val);
switch( chan ) {
case 0:
for( i = 0; i < work ; i ++ )
{
vje_mmx_negate( Y, Y );
Y += 8;
}
if (left )
{
for( i = 0; i < left; i ++ )
{
*(Y) = val - *(Y);
*(Y)++;
}
}
break;
case 1:
work = uv_len >> 3;
left = uv_len % 8;
for( i = 0; i < work ; i ++ )
{
vje_mmx_negate( Cb, Cb );
Cr += 8;
}
if(left )
{
for( i = 0; i < left; i ++ )
{
*(Cb) = val - *(Cb);
*(Cb)++;
}
}
break;
case 2:
work = uv_len >> 3;
left = uv_len % 8;
for( i = 0; i < work ; i ++ )
{
vje_mmx_negate( Cr, Cr );
Cr += 8;
}
if(left )
{
for( i = 0; i < left; i ++ )
{
*(Cr) = val - *(Cr);
*(Cr)++;
}
}
break;
}
do_emms;
#endif
}

View File

@@ -23,6 +23,10 @@
#include <libvjmem/vjmem.h>
#include "common.h"
#include "negation.h"
// Compiler auto-vectorization of the plain C loops is better than the hand-written MMX path, so force the C path here.
#undef HAVE_ASM_MMX
vj_effect *negation_init(int w, int h)
{
vj_effect *ve = (vj_effect *) vj_calloc(sizeof(vj_effect));
@@ -54,61 +58,13 @@ void negation_apply( VJFrame *frame, int width, int height, int val)
uint8_t *Cb = frame->data[1];
uint8_t *Cr = frame->data[2];
#ifndef HAVE_ASM_MMX
for (i = 0; i < len; i++) {
*(Y) = val - *(Y);
*(Y)++;
for( i = 0; i < len; i ++ ) {
Y[i] = val - Y[i];
}
for (i = 0; i < uv_len; i++) {
*(Cb) = val - *(Cb);
*(Cb)++;
*(Cr) = val - *(Cr);
*(Cr)++;
}
#else
int left = len % 8;
int work= len >> 3;
vje_load_mask(val);
for( i = 0; i < work ; i ++ )
{
vje_mmx_negate( Y, Y );
Y += 8;
for( i = 0; i < uv_len; i ++ ) {
Cb[i] = val - Cb[i];
Cr[i] = val - Cr[i];
}
if (left )
{
for( i = 0; i < left; i ++ )
{
*(Y) = val - *(Y);
*(Y)++;
}
}
work = uv_len >> 3;
left = uv_len % 8;
for( i = 0; i < work ; i ++ )
{
vje_mmx_negate( Cb, Cb );
vje_mmx_negate( Cr, Cr );
Cb += 8;
Cr += 8;
}
if(left )
{
for( i = 0; i < left; i ++ )
{
*(Cb) = val - *(Cb);
*(Cb)++;
*(Cr) = val - *(Cr);
*(Cr)++;
}
}
do_emms;
#endif
}