mirror of
https://github.com/game-stop/veejay.git
synced 2025-12-18 13:49:58 +01:00
1238 lines
29 KiB
C
1238 lines
29 KiB
C
/* motion.c, motion estimation */
|
||
|
||
|
||
/* (C) 2000/2001 Andrew Stevens */
|
||
|
||
/* These modifications are free software; you can redistribute it
|
||
* and/or modify it under the terms of the GNU General Public License
|
||
* as published by the Free Software Foundation; either version 2 of
|
||
* the License, or (at your option) any later version.
|
||
*
|
||
* This program is distributed in the hope that it will be useful,
|
||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
* General Public License for more details.
|
||
*
|
||
* You should have received a copy of the GNU General Public License
|
||
* along with this program; if not, write to the Free Software
|
||
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
|
||
* 02111-1307, USA.
|
||
*
|
||
*/
|
||
|
||
#include <config.h>
|
||
#include <stdio.h>
|
||
#include <stdlib.h>
|
||
#include <limits.h>
|
||
#include "math.h"
|
||
#include "cpu_accel.h"
|
||
#include "fastintfns.h"
|
||
#include "motionsearch.h"
|
||
#include "mjpeg_logging.h"
|
||
|
||
|
||
#if defined(HAVE_ASM_MMX) && defined(HAVE_ASM_NASM)
|
||
|
||
#include "mblock_sub44_sads_x86.h"
|
||
|
||
static int (*pmblocks_sub44_mests)( uint8_t *blk, uint8_t *ref,
|
||
int ilow, int jlow,
|
||
int ihigh, int jhigh,
|
||
int h, int rowstride,
|
||
int threshold,
|
||
me_result_s *resvec);
|
||
|
||
|
||
void mblock_sub22_nearest4_sads_mmxe(uint8_t *blk1,uint8_t *blk2,
|
||
int frowstride,int fh, int* resvec)
|
||
__asm__ ("mblock_sub22_nearest4_sads_mmxe");
|
||
|
||
void mblock_nearest4_sads_mmxe(uint8_t *blk1, uint8_t *blk2,
|
||
int rowstride, int h, int *resvec)
|
||
__asm__ ("mblock_nearest4_sads_mmxe");
|
||
|
||
int sad_00_mmxe(uint8_t *blk1, uint8_t *blk2, int rowstride, int h, int distlim) __asm__ ("sad_00_mmxe");
|
||
int sad_01_mmxe(uint8_t *blk1, uint8_t *blk2, int rowstride, int h) __asm__ ("sad_01_mmxe");
|
||
int sad_10_mmxe(uint8_t *blk1, uint8_t *blk2, int rowstride, int h) __asm__ ("sad_10_mmxe");
|
||
int sad_11_mmxe(uint8_t *blk1, uint8_t *blk2, int rowstride, int h) __asm__ ("sad_11_mmxe");
|
||
|
||
|
||
int sad_sub22_mmxe ( uint8_t *blk1, uint8_t *blk2, int frowstride, int fh) __asm__ ("sad_sub22_mmxe");
|
||
int sad_sub44_mmxe ( uint8_t *blk1, uint8_t *blk2, int qrowstride, int qh) __asm__ ("sad_sub44_mmxe");
|
||
int sumsq_mmx( uint8_t *blk1, uint8_t *blk2,
|
||
int rowstride, int hx, int hy, int h) __asm__ ("sumsq_mmx");
|
||
int sumsq_sub22_mmx( uint8_t *blk1, uint8_t *blk2,
|
||
int rowstride, int h) __asm__ ("sumsq_sub22_mmx");
|
||
int bsumsq_sub22_mmx( uint8_t *blk1f, uint8_t *blk1b,
|
||
uint8_t *blk2,
|
||
int rowstride, int h) __asm__ ("bsumsq_sub22_mmx");
|
||
int bsumsq_mmx (uint8_t *pf, uint8_t *pb,
|
||
uint8_t *p2, int rowstride,
|
||
int hxf, int hyf, int hxb, int hyb, int h) __asm__ ("bsumsq_mmx");
|
||
int bsad_mmx (uint8_t *pf, uint8_t *pb,
|
||
uint8_t *p2, int rowstride,
|
||
int hxf, int hyf, int hxb, int hyb, int h) __asm__ ("bsad_mmx");
|
||
|
||
int variance_mmx( uint8_t *p, int size, int rowstride) __asm__ ("variance_mmx");
|
||
|
||
int sad_00_mmx ( uint8_t *blk1, uint8_t *blk2, int rowstride, int h, int distlim) __asm__ ("sad_00_mmx");
|
||
int sad_01_mmx(uint8_t *blk1, uint8_t *blk2, int rowstride, int h) __asm__ ("sad_01_mmx");
|
||
int sad_10_mmx(uint8_t *blk1, uint8_t *blk2, int rowstride, int h) __asm__ ("sad_10_mmx");
|
||
int sad_11_mmx(uint8_t *blk1, uint8_t *blk2, int rowstride, int h) __asm__ ("sad_11_mmx");
|
||
int sad_sub22_mmx ( uint8_t *blk1, uint8_t *blk2, int frowstride, int fh) __asm__ ("sad_sub22_mmx");
|
||
int sad_sub44_mmx (uint8_t *blk1, uint8_t *blk2, int qrowstride, int qh) __asm__ ("sad_sub44_mmx");
|
||
|
||
#endif
|
||
|
||
|
||
/*
|
||
* Function pointers for selecting CPU specific implementations
|
||
*
|
||
*/
|
||
|
||
void (*pfind_best_one_pel)( me_result_set *sub22set,
|
||
uint8_t *org, uint8_t *blk,
|
||
int i0, int j0,
|
||
int ihigh, int jhigh,
|
||
int rowstride, int h,
|
||
me_result_s *res
|
||
);
|
||
int (*pbuild_sub22_mests)( me_result_set *sub44set,
|
||
me_result_set *sub22set,
|
||
int i0, int j0, int ihigh, int jhigh,
|
||
int null_mc_sad,
|
||
uint8_t *s22org, uint8_t *s22blk,
|
||
int frowstride, int fh,
|
||
int reduction
|
||
);
|
||
|
||
int (*pbuild_sub44_mests)( me_result_set *sub44set,
|
||
int ilow, int jlow, int ihigh, int jhigh,
|
||
int i0, int j0,
|
||
int null_mc_sad,
|
||
uint8_t *s44org, uint8_t *s44blk,
|
||
int qrowstride, int qh,
|
||
int reduction );
|
||
|
||
int (*psumsq_sub22)( uint8_t *blk1, uint8_t *blk2,
|
||
int rowstride, int h);
|
||
int (*pbsumsq_sub22)( uint8_t *blk1f, uint8_t *blk1b,
|
||
uint8_t *blk2,
|
||
int rowstride, int h);
|
||
|
||
int (*pvariance)(uint8_t *mb, int size, int rowstride);
|
||
|
||
|
||
int (*psad_sub22) ( uint8_t *blk1, uint8_t *blk2, int frowstride, int fh);
|
||
int (*psad_sub44) ( uint8_t *blk1, uint8_t *blk2, int qrowstride, int qh);
|
||
int (*psad_00) ( uint8_t *blk1, uint8_t *blk2, int rowstride, int h, int distlim);
|
||
int (*psad_01) (uint8_t *blk1, uint8_t *blk2, int rowstride, int h);
|
||
int (*psad_10) (uint8_t *blk1, uint8_t *blk2, int rowstride, int h);
|
||
int (*psad_11) (uint8_t *blk1, uint8_t *blk2, int rowstride, int h);
|
||
|
||
int (*psumsq) (uint8_t *blk1, uint8_t *blk2,
|
||
int rowstride, int hx, int hy, int h);
|
||
|
||
|
||
int (*pbsumsq) (uint8_t *pf, uint8_t *pb,
|
||
uint8_t *p2, int rowstride, int hxf, int hyf, int hxb, int hyb, int h);
|
||
|
||
int (*pbsad) (uint8_t *pf, uint8_t *pb,
|
||
uint8_t *p2, int rowstride, int hxf, int hyf, int hxb, int hyb, int h);
|
||
|
||
|
||
|
||
|
||
|
||
/*
|
||
* Round search radius to suit the search algorithm.
|
||
* Currently radii must be multiples of 8.
|
||
*
|
||
*/
|
||
|
||
int round_search_radius( int radius )
|
||
{
|
||
return intmax(8,((radius+4) /8)*8);
|
||
}
|
||
|
||
|
||
|
||
/*
|
||
Take a vector of motion estimations and repeatedly make passes
|
||
discarding all elements whose sad "weight" is above the current mean weight.
|
||
*/
|
||
|
||
static void sub_mean_reduction( me_result_set *matchset,
|
||
int times,
|
||
int *minweight_res)
|
||
{
|
||
me_result_s *matches = matchset->mests;
|
||
int len = matchset->len;
|
||
int i,j;
|
||
int weight_sum;
|
||
int mean_weight;
|
||
int min_weight = 100000;
|
||
if( len == 0 )
|
||
{
|
||
*minweight_res = 100000;
|
||
matchset->len = 0;
|
||
return;
|
||
}
|
||
|
||
for(;;)
|
||
{
|
||
weight_sum = 0;
|
||
for( i = 0; i < len ; ++i )
|
||
weight_sum += matches[i].weight;
|
||
mean_weight = weight_sum / len;
|
||
|
||
if( times <= 0)
|
||
break;
|
||
|
||
j = 0;
|
||
for( i =0; i < len; ++i )
|
||
{
|
||
if( matches[i].weight <= mean_weight )
|
||
{
|
||
if( times == 1)
|
||
{
|
||
min_weight = matches[i].weight ;
|
||
}
|
||
matches[j] = matches[i];
|
||
++j;
|
||
}
|
||
}
|
||
len = j;
|
||
--times;
|
||
}
|
||
matchset->len = len;
|
||
*minweight_res = mean_weight;
|
||
}
|
||
|
||
/*
|
||
* Build a vector of the top 4*4 sub-sampled motion estimations in
|
||
* the box (ilow,jlow) to (ihigh,jhigh).
|
||
*
|
||
* The algorithm is as follows:
|
||
*
|
||
* 1. Matches on an 4*4 pel grid are collected. All those matches
|
||
* whose that is over a (conservative) threshold (basically 50% more
|
||
* than moving average of the mean sad of such matches) are discarded.
|
||
*
|
||
* 2. Multiple passes are made discarding worse than-average matches.
|
||
* The number of passes is specified by the user. The default it 2
|
||
* (leaving roughly 1/4 of the matches).
|
||
*
|
||
* The intial threshold and discard passes are controlled by reduction
|
||
* [1..4]. The intial SAD threshold is calculated as 6 / reduction of
|
||
* a reference SAD passed as a parameter. For reduction == 1 one
|
||
* discard pass is made otherwise two are made.
|
||
*
|
||
* The net result is very fast and find good matches if they're to be
|
||
* found. I.e. the penalty over exhaustive search is pretty low.
|
||
*
|
||
* NOTE: The "discard below average" trick depends critically on
|
||
* having some variation in the matches. The slight penalty imposed
|
||
* for distant matches (reasonable since the motion vectors have to
|
||
* be encoded) is *vital* as otherwise pathologically bad performance
|
||
* results on highly uniform images.
|
||
*
|
||
* TODO: We should probably allow the user to eliminate the initial
|
||
* thinning of 4*4 grid matches if ultimate quality is demanded
|
||
* (e.g. for low bit-rate applications).
|
||
*
|
||
*/
|
||
|
||
static int build_sub44_mests( me_result_set *sub44set,
|
||
int ilow, int jlow, int ihigh, int jhigh,
|
||
int i0, int j0,
|
||
int null_ctl_sad,
|
||
uint8_t *s44org, uint8_t *s44blk,
|
||
int qrowstride, int qh,
|
||
int reduction
|
||
)
|
||
{
|
||
uint8_t *s44orgblk;
|
||
me_result_s *sub44_mests = sub44set->mests;
|
||
int istrt = ilow-i0;
|
||
int jstrt = jlow-j0;
|
||
int iend = ihigh-i0;
|
||
int jend = jhigh-j0;
|
||
int mean_weight;
|
||
int threshold;
|
||
|
||
int i,j;
|
||
int s1;
|
||
uint8_t *old_s44orgblk;
|
||
int sub44_num_mests;
|
||
|
||
/* N.b. we may ignore the right hand block of the pair going over the
|
||
right edge as we have carefully allocated the buffer oversize to ensure
|
||
no memory faults. The later motion estimation calculations
|
||
performed on the results of this pass will filter out
|
||
out-of-range blocks...
|
||
*/
|
||
|
||
threshold = 6*null_ctl_sad / (4*4*reduction);
|
||
s44orgblk = s44org+(ilow>>2)+qrowstride*(jlow>>2);
|
||
|
||
/* Exhaustive search on 4*4 sub-sampled data. This is affordable because
|
||
(a) it is only 16th of the size of the real 1-pel data
|
||
(b) we ignore those matches with an sad above our threshold.
|
||
*/
|
||
|
||
sub44_num_mests = 0;
|
||
|
||
/* Invariant: s44orgblk = s44org+(i>>2)+qrowstride*(j>>2) */
|
||
s44orgblk = s44org+(ilow>>2)+qrowstride*(jlow>>2);
|
||
for( j = jstrt; j <= jend; j += 4 )
|
||
{
|
||
old_s44orgblk = s44orgblk;
|
||
for( i = istrt; i <= iend; i += 4 )
|
||
{
|
||
s1 = ((*psad_sub44)( s44orgblk,s44blk,qrowstride,qh) & 0xffff);
|
||
if( s1 < threshold )
|
||
{
|
||
threshold = intmin(s1<<2,threshold);
|
||
sub44_mests[sub44_num_mests].x = i;
|
||
sub44_mests[sub44_num_mests].y = j;
|
||
sub44_mests[sub44_num_mests].weight = s1 +
|
||
(intmax(intabs(i-i0),intabs(j-j0))<<1);
|
||
++sub44_num_mests;
|
||
}
|
||
s44orgblk += 1;
|
||
}
|
||
s44orgblk = old_s44orgblk + qrowstride;
|
||
}
|
||
sub44set->len = sub44_num_mests;
|
||
|
||
sub_mean_reduction( sub44set, 1+(reduction>1), &mean_weight);
|
||
|
||
|
||
return sub44set->len;
|
||
}
|
||
|
||
#if defined(HAVE_ASM_MMX) && defined(HAVE_ASM_NASM)
|
||
|
||
static int build_sub44_mests_mmx( me_result_set *sub44set,
|
||
int ilow, int jlow, int ihigh, int jhigh,
|
||
int i0, int j0,
|
||
int null_ctl_sad,
|
||
uint8_t *s44org, uint8_t *s44blk,
|
||
int qrowstride, int qh,
|
||
int reduction)
|
||
{
|
||
uint8_t *s44orgblk;
|
||
me_result_s *sub44_mests = sub44set->mests;
|
||
int istrt = ilow-i0;
|
||
int jstrt = jlow-j0;
|
||
int iend = ihigh-i0;
|
||
int jend = jhigh-j0;
|
||
int mean_weight;
|
||
int threshold;
|
||
|
||
|
||
threshold = 6*null_ctl_sad / (4*4*reduction);
|
||
s44orgblk = s44org+(ilow>>2)+qrowstride*(jlow>>2);
|
||
|
||
sub44set->len = (*pmblocks_sub44_mests)( s44orgblk, s44blk,
|
||
istrt, jstrt,
|
||
iend, jend,
|
||
qh, qrowstride,
|
||
threshold,
|
||
sub44_mests);
|
||
|
||
/* If we're really pushing quality we reduce once otherwise twice. */
|
||
|
||
sub_mean_reduction( sub44set, 1+(reduction>1), &mean_weight);
|
||
|
||
|
||
return sub44set->len;
|
||
}
|
||
#endif
|
||
|
||
|
||
|
||
/* Build a vector of the best 2*2 sub-sampled motion estimations for
|
||
* the 16*16 macroblock at i0,j0 using a set of 4*4 sub-sampled matches as
|
||
* starting points. As with with the 4*4 matches We don't collect
|
||
* them densely as they're just search starting points for 1-pel
|
||
* search and ones that are 1 out * should still give better than
|
||
* average matches...
|
||
*
|
||
* The resulting candidate motion vectors are thinned by thresholding
|
||
* and discarding worse than-average matches.
|
||
*
|
||
* The intial threshold and number of discard passes are controlled by reduction
|
||
* [1..4]: the intial SAD threshold is calculated as 6 / reduction of
|
||
* a reference SAD passed as a parameter and then reduction discard passes
|
||
* are made.
|
||
*
|
||
* A super-fast version using MMX assembly code for X86 follows.
|
||
* Other CPU's could/should be handled the same way. */
|
||
|
||
|
||
static int build_sub22_mests( me_result_set *sub44set,
|
||
me_result_set *sub22set,
|
||
int i0, int j0, int ihigh, int jhigh,
|
||
int null_ctl_sad,
|
||
uint8_t *s22org, uint8_t *s22blk,
|
||
int frowstride, int fh,
|
||
int reduction)
|
||
{
|
||
int i,k,s;
|
||
int threshold = 6*null_ctl_sad / (2 * 2*reduction);
|
||
|
||
int min_weight;
|
||
int ilim = ihigh-i0;
|
||
int jlim = jhigh-j0;
|
||
int x,y;
|
||
uint8_t *s22orgblk;
|
||
|
||
sub22set->len = 0;
|
||
for( k = 0; k < sub44set->len; ++k )
|
||
{
|
||
|
||
x = sub44set->mests[k].x;
|
||
y = sub44set->mests[k].y;
|
||
|
||
s22orgblk = s22org +((y+j0)>>1)*frowstride +((x+i0)>>1);
|
||
for( i = 0; i < 4; ++i )
|
||
{
|
||
if( x <= ilim && y <= jlim )
|
||
{
|
||
s = (*psad_sub22)( s22orgblk,s22blk,frowstride,fh)+
|
||
(intmax(intabs(x),intabs(y))<<3);
|
||
if( s < threshold )
|
||
{
|
||
me_result_s *mc = &sub22set->mests[sub22set->len];
|
||
mc->x = (int8_t)x;
|
||
mc->y = (int8_t)y;
|
||
mc->weight = s;
|
||
++(sub22set->len);
|
||
}
|
||
}
|
||
|
||
if( i == 1 )
|
||
{
|
||
s22orgblk += frowstride-1;
|
||
x -= 2;
|
||
y += 2;
|
||
}
|
||
else
|
||
{
|
||
s22orgblk += 1;
|
||
x += 2;
|
||
|
||
}
|
||
}
|
||
|
||
}
|
||
|
||
sub_mean_reduction( sub22set, reduction, &min_weight );
|
||
return sub22set->len;
|
||
}
|
||
|
||
#if defined(HAVE_ASM_MMX) && defined(HAVE_ASM_NASM)
|
||
static int build_sub22_mests_mmxe( me_result_set *sub44set,
|
||
me_result_set *sub22set,
|
||
int i0, int j0, int ihigh, int jhigh,
|
||
int null_ctl_sad,
|
||
uint8_t *s22org, uint8_t *s22blk,
|
||
int frowstride, int fh,
|
||
int reduction)
|
||
{
|
||
int i,k,s;
|
||
int threshold = 6*null_ctl_sad / (2 * 2*reduction);
|
||
|
||
int min_weight;
|
||
int ilim = ihigh-i0;
|
||
int jlim = jhigh-j0;
|
||
int x,y;
|
||
uint8_t *s22orgblk;
|
||
int resvec[4];
|
||
|
||
/* TODO: The calculation of the lstrow offset really belongs in
|
||
asm code... */
|
||
int lstrow=(fh-1)*frowstride;
|
||
|
||
sub22set->len = 0;
|
||
for( k = 0; k < sub44set->len; ++k )
|
||
{
|
||
|
||
x = sub44set->mests[k].x;
|
||
y = sub44set->mests[k].y;
|
||
|
||
s22orgblk = s22org +((y+j0)>>1)*frowstride +((x+i0)>>1);
|
||
/*
|
||
Get SAD for 2*2 subsampled macroblocks: orgblk,orgblk(+2,0),
|
||
orgblk(0,+2), and orgblk(+2,+2) Done all in one go to reduce
|
||
memory bandwidth demand
|
||
*/
|
||
mblock_sub22_nearest4_sads_mmxe(s22orgblk+lstrow, s22blk+lstrow, frowstride, fh, resvec);
|
||
for( i = 0; i < 4; ++i )
|
||
{
|
||
if( x <= ilim && y <= jlim )
|
||
{
|
||
s =resvec[i]+(intmax(intabs(x),intabs(y))<<3);
|
||
if( s < threshold )
|
||
{
|
||
me_result_s *mc = &sub22set->mests[sub22set->len];
|
||
mc->x = (int8_t)x;
|
||
mc->y = (int8_t)y;
|
||
mc->weight = s;
|
||
++(sub22set->len);
|
||
}
|
||
}
|
||
|
||
if( i == 1 )
|
||
{
|
||
x -= 2;
|
||
y += 2;
|
||
}
|
||
else
|
||
{
|
||
x += 2;
|
||
}
|
||
}
|
||
|
||
}
|
||
|
||
|
||
sub_mean_reduction( sub22set, reduction, &min_weight );
|
||
return sub22set->len;
|
||
}
|
||
|
||
#endif
|
||
|
||
/*
|
||
* Search for the best 1-pel match within 1-pel of a good 2*2-pel
|
||
* match.
|
||
*
|
||
* N.b. best_so_far must be initialised by the caller!
|
||
*/
|
||
|
||
|
||
static void find_best_one_pel( me_result_set *sub22set,
|
||
uint8_t *org, uint8_t *blk,
|
||
int i0, int j0,
|
||
int ihigh, int jhigh,
|
||
int rowstride, int h,
|
||
me_result_s *best_so_far
|
||
)
|
||
|
||
{
|
||
int i,k;
|
||
int d;
|
||
me_result_s minpos = *best_so_far;
|
||
int dmin = INT_MAX;
|
||
int ilim = ihigh-i0;
|
||
int jlim = jhigh-j0;
|
||
uint8_t *orgblk;
|
||
int penalty;
|
||
me_result_s matchrec;
|
||
|
||
for( k = 0; k < sub22set->len; ++k )
|
||
{
|
||
|
||
matchrec = sub22set->mests[k];
|
||
orgblk = org + (i0+matchrec.x)+rowstride*(j0+matchrec.y);
|
||
penalty = intmax(intabs(matchrec.x),intabs(matchrec.y))<<5;
|
||
|
||
for( i = 0; i < 4; ++i )
|
||
{
|
||
if( matchrec.x <= ilim && matchrec.y <= jlim )
|
||
{
|
||
|
||
d = penalty+(*psad_00)(orgblk,blk,rowstride,h, dmin);
|
||
if (d<dmin)
|
||
{
|
||
dmin = d;
|
||
minpos = matchrec;
|
||
}
|
||
}
|
||
if( i == 1 )
|
||
{
|
||
orgblk += rowstride-1;
|
||
matchrec.x -= 1;
|
||
matchrec.y += 1;
|
||
}
|
||
else
|
||
{
|
||
orgblk += 1;
|
||
matchrec.x += 1;
|
||
}
|
||
}
|
||
}
|
||
|
||
minpos.weight = (uint16_t)intmin(255*255, dmin);
|
||
*best_so_far = minpos;
|
||
}
|
||
|
||
#if defined(HAVE_ASM_MMX) && defined(HAVE_ASM_NASM)
|
||
static void find_best_one_pel_mmxe( me_result_set *sub22set,
|
||
uint8_t *org, uint8_t *blk,
|
||
int i0, int j0,
|
||
int ihigh, int jhigh,
|
||
int rowstride, int h,
|
||
me_result_s *best_so_far
|
||
)
|
||
|
||
{
|
||
int i,k;
|
||
int d;
|
||
me_result_s minpos = *best_so_far;
|
||
int ilim = ihigh-i0;
|
||
int jlim = jhigh-j0;
|
||
int dmin = INT_MAX;
|
||
uint8_t *orgblk;
|
||
int penalty;
|
||
me_result_s matchrec;
|
||
int resvec[4];
|
||
|
||
for( k = 0; k < sub22set->len; ++k )
|
||
{
|
||
matchrec = sub22set->mests[k];
|
||
orgblk = org + (i0+matchrec.x)+rowstride*(j0+matchrec.y);
|
||
penalty = intmax(abs(matchrec.x),abs(matchrec.y))<<5;
|
||
|
||
/* Get SAD for macroblocks: orgblk,orgblk(+1,0),
|
||
orgblk(0,+1), and orgblk(+1,+1)
|
||
Done all in one go to reduce memory bandwidth demand
|
||
*/
|
||
mblock_nearest4_sads_mmxe(orgblk,blk,rowstride,h,
|
||
resvec);
|
||
|
||
for( i = 0; i < 4; ++i )
|
||
{
|
||
if( matchrec.x <= ilim && matchrec.y <= jlim )
|
||
{
|
||
|
||
d = penalty+resvec[i];
|
||
if (d<dmin)
|
||
{
|
||
dmin = d;
|
||
minpos = matchrec;
|
||
}
|
||
}
|
||
if( i == 1 )
|
||
{
|
||
orgblk += rowstride-1;
|
||
matchrec.x -= 1;
|
||
matchrec.y += 1;
|
||
}
|
||
else
|
||
{
|
||
orgblk += 1;
|
||
matchrec.x += 1;
|
||
}
|
||
}
|
||
}
|
||
|
||
minpos.weight = (uint16_t)intmin(255*255, dmin);
|
||
*best_so_far = minpos;
|
||
|
||
}
|
||
#endif
|
||
|
||
|
||
/*
|
||
* sum absolute difference between two (16*h) blocks Four variations
|
||
* depending on the required half pel interpolation of blk1 (hx,hy)
|
||
*
|
||
* blk1,blk2: addresses of top left pels of both blocks
|
||
* rowstride: distance (in bytes) of vertically adjacent pels
|
||
* hx,hy: flags for horizontal and/or vertical interpolation
|
||
* h: height of block (usually 8 or 16)
|
||
* distlim: bail out if sum exceeds this value
|
||
*
|
||
**/
|
||
|
||
|
||
static int sad_00(uint8_t *blk1,uint8_t *blk2,
|
||
int rowstride, int h,int distlim)
|
||
{
|
||
uint8_t *p1,*p2;
|
||
int j;
|
||
int s;
|
||
register int v;
|
||
|
||
s = 0;
|
||
p1 = blk1;
|
||
p2 = blk2;
|
||
for (j=0; j<h; j++)
|
||
{
|
||
|
||
#define pipestep(o) v = p1[o]-p2[o]; s+= abs(v);
|
||
pipestep(0); pipestep(1); pipestep(2); pipestep(3);
|
||
pipestep(4); pipestep(5); pipestep(6); pipestep(7);
|
||
pipestep(8); pipestep(9); pipestep(10); pipestep(11);
|
||
pipestep(12); pipestep(13); pipestep(14); pipestep(15);
|
||
#undef pipestep
|
||
|
||
if (s >= distlim)
|
||
break;
|
||
|
||
p1+= rowstride;
|
||
p2+= rowstride;
|
||
}
|
||
return s;
|
||
}
|
||
|
||
static int sad_01(uint8_t *blk1,uint8_t *blk2,int rowstride, int h)
|
||
{
|
||
uint8_t *p1,*p2;
|
||
int i,j;
|
||
int s;
|
||
register int v;
|
||
|
||
s = 0;
|
||
p1 = blk1;
|
||
p2 = blk2;
|
||
for (j=0; j<h; j++)
|
||
{
|
||
for (i=0; i<16; i++)
|
||
{
|
||
|
||
v = ((unsigned int)(p1[i]+p1[i+1]+1)>>1) - p2[i];
|
||
s+=intabs(v);
|
||
}
|
||
p1+= rowstride;
|
||
p2+= rowstride;
|
||
}
|
||
return s;
|
||
}
|
||
|
||
static int sad_10(uint8_t *blk1,uint8_t *blk2, int rowstride, int h)
|
||
{
|
||
uint8_t *p1,*p1a,*p2;
|
||
int i,j;
|
||
int s;
|
||
register int v;
|
||
|
||
s = 0;
|
||
p1 = blk1;
|
||
p2 = blk2;
|
||
p1a = p1 + rowstride;
|
||
for (j=0; j<h; j++)
|
||
{
|
||
for (i=0; i<16; i++)
|
||
{
|
||
v = ((unsigned int)(p1[i]+p1a[i]+1)>>1) - p2[i];
|
||
s+= intabs(v);
|
||
}
|
||
p1 = p1a;
|
||
p1a+= rowstride;
|
||
p2+= rowstride;
|
||
}
|
||
|
||
return s;
|
||
}
|
||
|
||
static int sad_11(uint8_t *blk1,uint8_t *blk2, int rowstride, int h)
|
||
{
|
||
uint8_t *p1,*p1a,*p2;
|
||
int i,j;
|
||
int s;
|
||
register int v;
|
||
|
||
s = 0;
|
||
p1 = blk1;
|
||
p2 = blk2;
|
||
p1a = p1 + rowstride;
|
||
|
||
for (j=0; j<h; j++)
|
||
{
|
||
for (i=0; i<16; i++)
|
||
{
|
||
v = ((unsigned int)((p1[i]+p1[i+1])+(p1a[i]+p1a[i+1])+2)>>2) - p2[i];
|
||
s+=intabs(v);
|
||
}
|
||
p1 = p1a;
|
||
p1a+= rowstride;
|
||
p2+= rowstride;
|
||
}
|
||
return s;
|
||
}
|
||
|
||
|
||
/*
|
||
* Compute subsampled images for fast motion compensation search
|
||
* N.b. rowstride should be *two* line widths for interlace images
|
||
*/
|
||
|
||
void subsample_image( uint8_t *image, int rowstride,
|
||
uint8_t *sub22_image,
|
||
uint8_t *sub44_image)
|
||
{
|
||
uint8_t *blk = image;
|
||
uint8_t *b, *nb;
|
||
uint8_t *pb;
|
||
uint8_t *qb;
|
||
uint8_t *start_s22blk, *start_s44blk;
|
||
int i;
|
||
int nextfieldline = rowstride;
|
||
|
||
start_s22blk = sub22_image;
|
||
start_s44blk = sub44_image;
|
||
b = blk;
|
||
nb = (blk+nextfieldline);
|
||
|
||
pb = (uint8_t *) start_s22blk;
|
||
|
||
while( nb < start_s22blk )
|
||
{
|
||
for( i = 0; i < nextfieldline/4; ++i ) /* We're doing 4 pels horizontally at once */
|
||
{
|
||
pb[0] = ((b[0]+b[1])+(nb[0]+nb[1])+2)>>2;
|
||
pb[1] = ((b[2]+b[3])+(nb[2]+nb[3])+2)>>2;
|
||
pb += 2;
|
||
b += 4;
|
||
nb += 4;
|
||
}
|
||
b += nextfieldline;
|
||
nb = b + nextfieldline;
|
||
}
|
||
|
||
|
||
/* Now create the 4*4 sub-sampled data from the 2*2
|
||
N.b. the 2*2 sub-sampled motion data preserves the interlace structure of the
|
||
original. Albeit half as many lines and pixels...
|
||
*/
|
||
|
||
nextfieldline = nextfieldline >> 1;
|
||
|
||
qb = start_s44blk;
|
||
b = start_s22blk;
|
||
nb = (start_s22blk+nextfieldline);
|
||
|
||
while( nb < start_s44blk )
|
||
{
|
||
for( i = 0; i < nextfieldline/4; ++i )
|
||
{
|
||
qb[0] = ((b[0]+b[1])+(nb[0]+nb[1])+2)>>2;
|
||
qb[1] = ((b[2]+b[3])+(nb[2]+nb[3])+2)>>2;
|
||
qb += 2;
|
||
b += 4;
|
||
nb += 4;
|
||
}
|
||
b += nextfieldline;
|
||
nb = b + nextfieldline;
|
||
}
|
||
|
||
}
|
||
|
||
/*
|
||
* Same as sad_00 except for 2*2 subsampled data so only 8 wide!
|
||
*
|
||
*/
|
||
|
||
|
||
static int sad_sub22( uint8_t *s22blk1, uint8_t *s22blk2,int frowstride,int fh)
|
||
{
|
||
uint8_t *p1 = s22blk1;
|
||
uint8_t *p2 = s22blk2;
|
||
int s = 0;
|
||
int j;
|
||
|
||
for( j = 0; j < fh; ++j )
|
||
{
|
||
register int diff;
|
||
#define pipestep(o) diff = p1[o]-p2[o]; s += abs(diff)
|
||
pipestep(0); pipestep(1);
|
||
pipestep(2); pipestep(3);
|
||
pipestep(4); pipestep(5);
|
||
pipestep(6); pipestep(7);
|
||
p1 += frowstride;
|
||
p2 += frowstride;
|
||
#undef pipestep
|
||
}
|
||
|
||
return s;
|
||
}
|
||
|
||
|
||
|
||
/*
|
||
* Same as sad_00 except for 4*4 sub-sampled data.
|
||
*
|
||
* N.b.: currently assumes only 16*16 or 16*8 motion estimation will
|
||
* be used... I.e. 4*4 or 4*2 sub-sampled blocks will be compared.
|
||
*
|
||
*
|
||
*/
|
||
|
||
|
||
static int sad_sub44( uint8_t *s44blk1, uint8_t *s44blk2,int qrowstride,int qh)
|
||
{
|
||
register uint8_t *p1 = s44blk1;
|
||
register uint8_t *p2 = s44blk2;
|
||
int s = 0;
|
||
register int diff;
|
||
|
||
/* #define pipestep(o) diff = p1[o]-p2[o]; s += abs(diff) */
|
||
#define pipestep(o) diff = p1[o]-p2[o]; s += diff < 0 ? -diff : diff;
|
||
pipestep(0); pipestep(1); pipestep(2); pipestep(3);
|
||
if( qh > 1 )
|
||
{
|
||
p1 += qrowstride; p2 += qrowstride;
|
||
pipestep(0); pipestep(1); pipestep(2); pipestep(3);
|
||
if( qh > 2 )
|
||
{
|
||
p1 += qrowstride; p2 += qrowstride;
|
||
pipestep(0); pipestep(1); pipestep(2); pipestep(3);
|
||
p1 += qrowstride; p2 += qrowstride;
|
||
pipestep(0); pipestep(1); pipestep(2); pipestep(3);
|
||
}
|
||
}
|
||
|
||
|
||
return s;
|
||
}
|
||
|
||
/*
|
||
* total squared difference between two (8*h) blocks of 2*2 sub-sampled pels
|
||
* blk1,blk2: addresses of top left pels of both blocks
|
||
* rowstride: distance (in bytes) of vertically adjacent pels
|
||
* h: height of block (usually 8 or 16)
|
||
*/
|
||
|
||
static int sumsq_sub22(uint8_t *blk1, uint8_t *blk2, int rowstride, int h)
|
||
{
|
||
uint8_t *p1 = blk1, *p2 = blk2;
|
||
int i,j,v;
|
||
int s = 0;
|
||
for (j=0; j<h; j++)
|
||
{
|
||
for (i=0; i<8; i++)
|
||
{
|
||
v = p1[i] - p2[i];
|
||
s+= v*v;
|
||
}
|
||
p1+= rowstride;
|
||
p2+= rowstride;
|
||
}
|
||
return s;
|
||
}
|
||
|
||
/* total squared difference between bidirection prediction of (8*h)
|
||
* blocks of 2*2 sub-sampled pels and reference
|
||
* blk1f, blk1b,blk2: addresses of top left
|
||
* pels of blocks
|
||
* rowstride: distance (in bytes) of vertically adjacent
|
||
* pels
|
||
* h: height of block (usually 4 or 8)
|
||
*/
|
||
|
||
static int bsumsq_sub22(uint8_t *blk1f, uint8_t *blk1b, uint8_t *blk2,
|
||
int rowstride, int h)
|
||
{
|
||
uint8_t *p1f = blk1f,*p1b = blk1b,*p2 = blk2;
|
||
int i,j,v;
|
||
int s = 0;
|
||
for (j=0; j<h; j++)
|
||
{
|
||
for (i=0; i<8; i++)
|
||
{
|
||
v = ((p1f[i]+p1b[i]+1)>>1) - p2[i];
|
||
s+= v*v;
|
||
}
|
||
p1f+= rowstride;
|
||
p1b+= rowstride;
|
||
p2+= rowstride;
|
||
}
|
||
return s;
|
||
}
|
||
|
||
/*
|
||
* total squared difference between two (16*h) blocks
|
||
* including optional half pel interpolation of blk1 (hx,hy)
|
||
* blk1,blk2: addresses of top left pels of both blocks
|
||
* rowstride: distance (in bytes) of vertically adjacent pels
|
||
* hx,hy: flags for horizontal and/or vertical interpolation
|
||
* h: height of block (usually 8 or 16)
|
||
*/
|
||
|
||
|
||
static int sumsq(blk1,blk2,rowstride,hx,hy,h)
|
||
uint8_t *blk1,*blk2;
|
||
int rowstride,hx,hy,h;
|
||
{
|
||
uint8_t *p1,*p1a,*p2;
|
||
int i,j;
|
||
int s,v;
|
||
|
||
s = 0;
|
||
p1 = blk1;
|
||
p2 = blk2;
|
||
if (!hx && !hy)
|
||
for (j=0; j<h; j++)
|
||
{
|
||
for (i=0; i<16; i++)
|
||
{
|
||
v = p1[i] - p2[i];
|
||
s+= v*v;
|
||
}
|
||
p1+= rowstride;
|
||
p2+= rowstride;
|
||
}
|
||
else if (hx && !hy)
|
||
for (j=0; j<h; j++)
|
||
{
|
||
for (i=0; i<16; i++)
|
||
{
|
||
v = ((unsigned int)(p1[i]+p1[i+1]+1)>>1) - p2[i];
|
||
s+= v*v;
|
||
}
|
||
p1+= rowstride;
|
||
p2+= rowstride;
|
||
}
|
||
else if (!hx && hy)
|
||
{
|
||
p1a = p1 + rowstride;
|
||
for (j=0; j<h; j++)
|
||
{
|
||
for (i=0; i<16; i++)
|
||
{
|
||
v = ((unsigned int)(p1[i]+p1a[i]+1)>>1) - p2[i];
|
||
s+= v*v;
|
||
}
|
||
p1 = p1a;
|
||
p1a+= rowstride;
|
||
p2+= rowstride;
|
||
}
|
||
}
|
||
else /* if (hx && hy) */
|
||
{
|
||
p1a = p1 + rowstride;
|
||
for (j=0; j<h; j++)
|
||
{
|
||
for (i=0; i<16; i++)
|
||
{
|
||
v = ((unsigned int)(p1[i]+p1[i+1]+p1a[i]+p1a[i+1]+2)>>2) - p2[i];
|
||
s+= v*v;
|
||
}
|
||
p1 = p1a;
|
||
p1a+= rowstride;
|
||
p2+= rowstride;
|
||
}
|
||
}
|
||
|
||
return s;
|
||
}
|
||
|
||
|
||
/*
|
||
* absolute difference error between a (16*h) block and a bidirectional
|
||
* prediction
|
||
*
|
||
* p2: address of top left pel of block
|
||
* pf,hxf,hyf: address and half pel flags of forward ref. block
|
||
* pb,hxb,hyb: address and half pel flags of backward ref. block
|
||
* h: height of block
|
||
* rowstride: distance (in bytes) of vertically adjacent pels in p2,pf,pb
|
||
*/
|
||
|
||
|
||
static int bsad(pf,pb,p2,rowstride,hxf,hyf,hxb,hyb,h)
|
||
uint8_t *pf,*pb,*p2;
|
||
int rowstride,hxf,hyf,hxb,hyb,h;
|
||
{
|
||
uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
|
||
int i,j;
|
||
int s,v;
|
||
|
||
pfa = pf + hxf;
|
||
pfb = pf + rowstride*hyf;
|
||
pfc = pfb + hxf;
|
||
|
||
pba = pb + hxb;
|
||
pbb = pb + rowstride*hyb;
|
||
pbc = pbb + hxb;
|
||
|
||
s = 0;
|
||
|
||
for (j=0; j<h; j++)
|
||
{
|
||
for (i=0; i<16; i++)
|
||
{
|
||
v = ((((unsigned int)(*pf++ + *pfa++ + *pfb++ + *pfc++ + 2)>>2) +
|
||
((unsigned int)(*pb++ + *pba++ + *pbb++ + *pbc++ + 2)>>2) + 1)>>1)
|
||
- *p2++;
|
||
s += abs(v);
|
||
}
|
||
p2+= rowstride-16;
|
||
pf+= rowstride-16;
|
||
pfa+= rowstride-16;
|
||
pfb+= rowstride-16;
|
||
pfc+= rowstride-16;
|
||
pb+= rowstride-16;
|
||
pba+= rowstride-16;
|
||
pbb+= rowstride-16;
|
||
pbc+= rowstride-16;
|
||
}
|
||
|
||
return s;
|
||
}
|
||
|
||
/*
|
||
* squared error between a (16*h) block and a bidirectional
|
||
* prediction
|
||
*
|
||
* p2: address of top left pel of block
|
||
* pf,hxf,hyf: address and half pel flags of forward ref. block
|
||
* pb,hxb,hyb: address and half pel flags of backward ref. block
|
||
* h: height of block
|
||
* rowstride: distance (in bytes) of vertically adjacent pels in p2,pf,pb
|
||
*/
|
||
|
||
|
||
static int bsumsq(pf,pb,p2,rowstride,hxf,hyf,hxb,hyb,h)
|
||
uint8_t *pf,*pb,*p2;
|
||
int rowstride,hxf,hyf,hxb,hyb,h;
|
||
{
|
||
uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
|
||
int i,j;
|
||
int s,v;
|
||
|
||
pfa = pf + hxf;
|
||
pfb = pf + rowstride*hyf;
|
||
pfc = pfb + hxf;
|
||
|
||
pba = pb + hxb;
|
||
pbb = pb + rowstride*hyb;
|
||
pbc = pbb + hxb;
|
||
|
||
s = 0;
|
||
|
||
for (j=0; j<h; j++)
|
||
{
|
||
for (i=0; i<16; i++)
|
||
{
|
||
#define ui(x) ((unsigned int)x)
|
||
v = ((((ui(*pf++) + ui(*pfa++) + ui(*pfb++) + ui(*pfc++) + 2)>>2) +
|
||
((ui(*pb++) + ui(*pba++) + ui(*pbb++) + ui(*pbc++) + 2)>>2)
|
||
+ 1
|
||
)>>1) - ui(*p2++);
|
||
#undef ui
|
||
s+=v*v;
|
||
}
|
||
p2+= rowstride-16;
|
||
pf+= rowstride-16;
|
||
pfa+= rowstride-16;
|
||
pfb+= rowstride-16;
|
||
pfc+= rowstride-16;
|
||
pb+= rowstride-16;
|
||
pba+= rowstride-16;
|
||
pbb+= rowstride-16;
|
||
pbc+= rowstride-16;
|
||
}
|
||
|
||
return s;
|
||
}
|
||
|
||
|
||
/*
|
||
* variance of a (size*size) block, multiplied by 256
|
||
* p: address of top left pel of block
|
||
* rowstride: distance (in bytes) of vertically adjacent pels
|
||
* SIZE is a multiple of 8.
|
||
*/
|
||
static int variance(uint8_t *p, int size, int rowstride)
|
||
{
|
||
int i,j;
|
||
unsigned int v,s,s2;
|
||
int var;
|
||
s = s2 = 0;
|
||
|
||
for (j=0; j<size; j++)
|
||
{
|
||
for (i=0; i<size; i++)
|
||
{
|
||
v = *p++;
|
||
s+= v;
|
||
s2+= v*v;
|
||
}
|
||
p+= rowstride-size;
|
||
}
|
||
var = s2 - (s*s)/(size*size);
|
||
|
||
return var;
|
||
}
|
||
|
||
|
||
|
||
|
||
/*
|
||
* Initialise motion estimation - currently only selection of which
|
||
* versions of the various low level computation routines to use
|
||
*
|
||
*/
|
||
|
||
void init_motion_search(void)
|
||
{
|
||
int cpucap = cpu_accel();
|
||
|
||
if( cpucap == 0 ) /* No MMX/SSE etc support available */
|
||
{
|
||
psad_sub22 = sad_sub22;
|
||
psad_sub44 = sad_sub44;
|
||
psad_00 = sad_00;
|
||
psad_01 = sad_01;
|
||
psad_10 = sad_10;
|
||
psad_11 = sad_11;
|
||
pbsad = bsad;
|
||
pvariance = variance;
|
||
psumsq = sumsq;
|
||
pbsumsq = bsumsq;
|
||
psumsq_sub22 = sumsq_sub22;
|
||
pbsumsq_sub22 = bsumsq_sub22;
|
||
pfind_best_one_pel = find_best_one_pel;
|
||
pbuild_sub22_mests = build_sub22_mests;
|
||
pbuild_sub44_mests = build_sub44_mests;
|
||
}
|
||
#if defined(HAVE_ASM_MMX) && defined(HAVE_ASM_NASM)
|
||
else if(cpucap & ACCEL_X86_MMXEXT ) /* AMD MMX or SSE... */
|
||
{
|
||
mjpeg_info( "SETTING EXTENDED MMX for MOTION!");
|
||
psad_sub22 = sad_sub22_mmxe;
|
||
psad_sub44 = sad_sub44_mmxe;
|
||
psad_00 = sad_00_mmxe;
|
||
psad_01 = sad_01_mmxe;
|
||
psad_10 = sad_10_mmxe;
|
||
psad_11 = sad_11_mmxe;
|
||
pbsad = bsad_mmx;
|
||
pvariance = variance_mmx;
|
||
psumsq = sumsq_mmx;
|
||
pbsumsq = bsumsq_mmx;
|
||
psumsq_sub22 = sumsq_sub22_mmx;
|
||
pbsumsq_sub22 = bsumsq_sub22_mmx;
|
||
pfind_best_one_pel = find_best_one_pel_mmxe;
|
||
pbuild_sub22_mests = build_sub22_mests_mmxe;
|
||
pbuild_sub44_mests = build_sub44_mests_mmx;
|
||
pmblocks_sub44_mests = mblocks_sub44_mests_mmxe;
|
||
}
|
||
else if(cpucap & ACCEL_X86_MMX) /* Ordinary MMX CPU */
|
||
{
|
||
mjpeg_info( "SETTING MMX for MOTION!");
|
||
psad_sub22 = sad_sub22_mmx;
|
||
psad_sub44 = sad_sub44_mmx;
|
||
psad_00 = sad_00_mmx;
|
||
psad_01 = sad_01_mmx;
|
||
psad_10 = sad_10_mmx;
|
||
psad_11 = sad_11_mmx;
|
||
pbsad = bsad_mmx;
|
||
pvariance = variance_mmx;
|
||
psumsq = sumsq_mmx;
|
||
pbsumsq = bsumsq_mmx;
|
||
psumsq_sub22 = sumsq_sub22_mmx;
|
||
pbsumsq_sub22 = bsumsq_sub22_mmx;
|
||
pfind_best_one_pel = find_best_one_pel;
|
||
pbuild_sub22_mests = build_sub22_mests;
|
||
pbuild_sub44_mests = build_sub44_mests_mmx;
|
||
pmblocks_sub44_mests = mblocks_sub44_mests_mmx;
|
||
}
|
||
#endif
|
||
|
||
}
|