Files
veejay/veejay-2005/utils/mblock_sad_mmxe.s
Niels Elburg c84f6ca821 moved veejay-current to veejay-2005 (branch closed)
git-svn-id: svn://code.dyne.org/veejay/trunk@509 eb8d1916-c9e9-0310-b8de-cf0c9472ead5
2006-01-20 16:43:25 +00:00

1003 lines
21 KiB
ArmAsm

;;;
;;; mblock_sad_mmxe.s:
;;;
;;; Enhanced MMX optimized Sum Absolute Differences routines for macroblocks
;;; (interpolated, 1-pel, 2*2 sub-sampled pel and 4*4 sub-sampled pel)
;
; Original MMX sad_* Copyright (C) 2000 Chris Atenasio <chris@crud.net>
; Enhanced MMX and rest Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
;; Yes, I tried prefetch-ing. It makes no difference or makes
;; stuff *slower*.
;
; This program is free software; you can reaxstribute it and/or
; modify it under the terms of the GNU General Public License
; as published by the Free Software Foundation; either version 2
; of the License, or (at your option) any later version.
;
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program; if not, write to the Free Software
; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
;
;
;
SECTION .text
global sad_00_mmxe
; int sad_00(char *blk1,char *blk2,int lx,int h,int distlim);
; distlim unused - costs more to check than the savings of
; aborting the computation early from time to time...
; eax = p1
; ebx = p2
; ecx = rowsleft
; edx = lx;
; mm0 = distance accumulator
; mm1 = temp
; mm2 = temp
; mm3 = temp
; mm4 = temp
; mm5 = temp
; mm6 = temp
align 32
sad_00_mmxe:
push ebp ; save frame pointer
mov ebp, esp ; link
push ebx
push ecx
push edx
pxor mm0, mm0 ; zero acculumator
mov eax, [ebp+8] ; get p1
sad_00_0misalign:
mov ebx, [ebp+12] ; get p2
mov edx, [ebp+16] ; get lx
mov ecx, [ebp+20] ; get rowsleft
jmp nextrow00sse
align 32
nextrow00sse:
movq mm4, [eax] ; load first 8 bytes of p1 (row 1)
psadbw mm4, [ebx] ; compare to first 8 bytes of p2 (row 1)
movq mm5, [eax+8] ; load next 8 bytes of p1 (row 1)
add eax, edx ; update pointer to next row
paddd mm0, mm4 ; accumulate difference
psadbw mm5, [ebx+8] ; compare to next 8 bytes of p2 (row 1)
add ebx, edx ; ditto
paddd mm0, mm5 ; accumulate difference
movq mm6, [eax] ; load first 8 bytes of p1 (row 2)
psadbw mm6, [ebx] ; compare to first 8 bytes of p2 (row 2)
movq mm4, [eax+8] ; load next 8 bytes of p1 (row 2)
add eax, edx ; update pointer to next row
paddd mm0, mm6 ; accumulate difference
psadbw mm4, [ebx+8] ; compare to next 8 bytes of p2 (row 2)
add ebx, edx ; ditto
paddd mm0, mm4 ; accumulate difference
;psubd mm2, mm3 ; decrease rowsleft
;movq mm5, mm1 ; copy distlim
;pcmpgtd mm5, mm0 ; distlim > dist?
;pand mm2, mm5 ; mask rowsleft with answer
;movd ecx, mm2 ; move rowsleft to ecx
;add eax, edx ; update pointer to next row
;add ebx, edx ; ditto
;test ecx, ecx ; check rowsleft
sub ecx, 2
jnz nextrow00sse
movd eax, mm0 ; store return value
pop edx
pop ecx
pop ebx
pop ebp
emms
ret
global sad_00_Ammxe
;; This is a special version that only does aligned accesses...
;; Wonder if it'll make it faster on a P-III
;; ANSWER: NO its slower hence no longer used.
; int sad_00(char *blk1,char *blk2,int lx,int h,int distlim);
; distlim unused - costs more to check than the savings of
; aborting the computation early from time to time...
; eax = p1
; ebx = p2
; ecx = rowsleft
; edx = lx;
; mm0 = distance accumulator
; mm1 = temp
; mm2 = right shift to adjust for mis-align
; mm3 = left shift to adjust for mis-align
; mm4 = temp
; mm5 = temp
; mm6 = temp
align 32
sad_00_Ammxe:
push ebp ; save frame pointer
mov ebp, esp ; link
push ebx
push ecx
push edx
pxor mm0, mm0 ; zero acculumator
mov eax, [ebp+8] ; get p1
mov ebx, eax
and ebx, 7 ; Misalignment!
cmp ebx, 0
jz near sad_00_0misalign
sub eax, ebx ; Align eax
mov ecx, 8 ; ecx = 8-misalignment
sub ecx, ebx
shl ebx, 3 ; Convert into bit-shifts...
shl ecx, 3
movd mm2, ebx ; mm2 = shift to start msb
movd mm3, ecx ; mm3 = shift to end lsb
mov ebx, [ebp+12] ; get p2
mov edx, [ebp+16] ; get lx
mov ecx, [ebp+20] ; get rowsleft
jmp nextrow00ssea
align 32
nextrow00ssea:
movq mm4, [eax] ; load first 8 bytes of aligned p1 (row 1)
movq mm5, [eax+8] ; load next 8 bytes of aligned p1 (row 1)
movq mm6, mm5
psrlq mm4, mm2 ; mm4 first 8 bytes of p1 proper
psllq mm5, mm3
por mm4, mm5
psadbw mm4, [ebx] ; compare to first 8 bytes of p2
movq mm7, [eax+16] ; load last 8 bytes of aligned p1
add eax, edx ; update pointer to next row
psrlq mm6, mm2 ; mm6 2nd 8 bytes of p1 proper
psllq mm7, mm3
por mm6, mm7
paddd mm0, mm4 ; accumulate difference
psadbw mm6, [ebx+8] ; compare to next 8 bytes of p2 (row 1)
add ebx, edx ; ditto
paddd mm0, mm6 ; accumulate difference
sub ecx, 1
jnz nextrow00ssea
movd eax, mm0 ; store return value
pop edx
pop ecx
pop ebx
pop ebp
emms
ret
global sad_01_mmxe
; int sad_01(char *blk1,char *blk2,int lx,int h);
; eax = p1
; ebx = p2
; ecx = counter temp
; edx = lx;
; mm0 = distance accumulator
; mm1 = distlim
; mm2 = rowsleft
; mm3 = 2 (rows per loop)
; mm4 = temp
; mm5 = temp
; mm6 = temp
align 32
sad_01_mmxe:
push ebp
mov ebp, esp
push ebx
push ecx
push edx
pxor mm0, mm0 ; zero acculumator
mov eax, [ebp+8] ; get p1
mov ebx, [ebp+12] ; get p2
mov edx, [ebp+16] ; get lx
mov ecx, [ebp+20] ; get rowsleft
jmp nextrow01 ; snap to it
align 32
nextrow01:
movq mm4, [eax] ; load first 8 bytes of p1 (row 1)
pavgb mm4, [eax+1] ; Interpolate...
psadbw mm4, [ebx] ; compare to first 8 bytes of p2 (row 1)
paddd mm0, mm4 ; accumulate difference
movq mm5, [eax+8] ; load next 8 bytes of p1 (row 1)
pavgb mm5, [eax+9] ; Interpolate
psadbw mm5, [ebx+8] ; compare to next 8 bytes of p2 (row 1)
paddd mm0, mm5 ; accumulate difference
add eax, edx ; update pointer to next row
add ebx, edx ; ditto
movq mm6, [eax] ; load first 8 bytes of p1 (row 2)
pavgb mm6, [eax+1] ; Interpolate
psadbw mm6, [ebx] ; compare to first 8 bytes of p2 (row 2)
paddd mm0, mm6 ; accumulate difference
movq mm7, [eax+8] ; load next 8 bytes of p1 (row 2)
pavgb mm7, [eax+9]
psadbw mm7, [ebx+8] ; compare to next 8 bytes of p2 (row 2)
paddd mm0, mm7 ; accumulate difference
add eax, edx ; update pointer to next row
add ebx, edx ; ditto
sub ecx, 2 ; check rowsleft
jnz nextrow01 ; rinse and repeat
movd eax, mm0 ; store return value
pop edx
pop ecx
pop ebx
pop ebp ; restore stack pointer
emms ; clear mmx registers
ret ; we now return you to your regular programming
global sad_10_mmxe
; int sad_10(char *blk1,char *blk2,int lx,int h);
; eax = p1
; ebx = p2
; ecx = counter temp
; edx = lx;
; edi = p1+lx
; mm0 = distance accumulator
; mm2 = rowsleft
; mm3 = 2 (rows per loop)
; mm4 = temp
; mm5 = temp
; mm6 = temp
align 32
sad_10_mmxe:
push ebp ; save stack pointer
mov ebp, esp
push ebx
push ecx
push edx
push edi
pxor mm0, mm0 ; zero acculumator
mov eax, [ebp+8] ; get p1
mov ebx, [ebp+12] ; get p2
mov edx, [ebp+16] ; get lx
mov edi, eax
add edi, edx
mov ecx, [ebp+20] ; get rowsleft
jmp nextrow10 ; snap to it
align 32
nextrow10:
movq mm4, [eax] ; load first 8 bytes of p1 (row 1)
pavgb mm4, [edi] ; Interpolate...
psadbw mm4, [ebx] ; compare to first 8 bytes of p2 (row 1)
paddd mm0, mm4 ; accumulate difference
movq mm5, [eax+8] ; load next 8 bytes of p1 (row 1)
pavgb mm5, [edi+8] ; Interpolate
psadbw mm5, [ebx+8] ; compare to next 8 bytes of p2 (row 1)
paddd mm0, mm5 ; accumulate difference
add eax, edx ; update pointer to next row
add ebx, edx ; ditto
add edi, edx
movq mm6, [eax] ; load first 8 bytes of p1 (row 2)
pavgb mm6, [edi] ; Interpolate
psadbw mm6, [ebx] ; compare to first 8 bytes of p2 (row 2)
paddd mm0, mm6 ; accumulate difference
movq mm7, [eax+8] ; load next 8 bytes of p1 (row 2)
pavgb mm7, [edi+8]
psadbw mm7, [ebx+8] ; compare to next 8 bytes of p2 (row 2)
paddd mm0, mm7 ; accumulate difference
psubd mm2, mm3 ; decrease rowsleft
add eax, edx ; update pointer to next row
add ebx, edx ; ditto
add edi, edx
sub ecx, 2 ; check rowsleft (we're doing 2 at a time)
jnz nextrow10 ; rinse and repeat
movd eax, mm0 ; store return value
pop edi
pop edx
pop ecx
pop ebx
pop ebp ; restore stack pointer
emms ; clear mmx registers
ret ; we now return you to your regular programming
global sad_11_mmxe
; int sad_11(char *blk1,char *blk2,int lx,int h);
; eax = p1
; ebx = p2
; ecx = counter temp
; edx = lx;
; edi = p1+lx
; mm0 = distance accumulator
; mm2 = rowsleft
; mm3 = 2 (rows per loop)
; mm4 = temp
; mm5 = temp
; mm6 = temp
align 32
sad_11_mmxe:
push ebp ; save stack pointer
mov ebp, esp ; so that we can do this
push ebx ; save the pigs
push ecx ; make them squeal
push edx ; lets have pigs for every meal
push edi
pxor mm0, mm0 ; zero acculumator
mov eax, [ebp+8] ; get p1
mov ebx, [ebp+12] ; get p2
mov edx, [ebp+16] ; get lx
mov edi, eax
add edi, edx
mov ecx, [ebp+20] ; get rowsleft
jmp nextrow11 ; snap to it
align 32
nextrow11:
movq mm4, [eax] ; load first 8 bytes of p1 (row 1)
pavgb mm4, [edi] ; Interpolate...
movq mm5, [eax+1]
pavgb mm5, [edi+1]
pavgb mm4, mm5
psadbw mm4, [ebx] ; compare to first 8 bytes of p2 (row 1)
paddd mm0, mm4 ; accumulate difference
movq mm6, [eax+8] ; load next 8 bytes of p1 (row 1)
pavgb mm6, [edi+8] ; Interpolate
movq mm7, [eax+9]
pavgb mm7, [edi+9]
pavgb mm6, mm7
psadbw mm6, [ebx+8] ; compare to next 8 bytes of p2 (row 1)
paddd mm0, mm6 ; accumulate difference
add eax, edx ; update pointer to next row
add ebx, edx ; ditto
add edi, edx
movq mm4, [eax] ; load first 8 bytes of p1 (row 1)
pavgb mm4, [edi] ; Interpolate...
movq mm5, [eax+1]
pavgb mm5, [edi+1]
pavgb mm4, mm5
psadbw mm4, [ebx] ; compare to first 8 bytes of p2 (row 1)
paddd mm0, mm4 ; accumulate difference
movq mm6, [eax+8] ; load next 8 bytes of p1 (row 1)
pavgb mm6, [edi+8] ; Interpolate
movq mm7, [eax+9]
pavgb mm7, [edi+9]
pavgb mm6, mm7
psadbw mm6, [ebx+8] ; compare to next 8 bytes of p2 (row 1)
paddd mm0, mm6 ; accumulate difference
add eax, edx ; update pointer to next row
add ebx, edx ; ditto
add edi, edx
sub ecx, 2 ; check rowsleft
jnz near nextrow11 ; rinse and repeat
movd eax, mm0 ; store return value
pop edi
pop edx
pop ecx
pop ebx
pop ebp ; restore stack pointer
emms ; clear mmx registers
ret ; we now return you to your regular programming
global sad_sub22_mmxe
; int sad_sub22_mmxe(unsigned char *blk1,unsigned char *blk2,int flx,int fh);
; eax = p1
; ebx = p2
; ecx = counter temp
; edx = flx;
; mm0 = distance accumulator
; mm2 = rowsleft
; mm3 = 2 (rows per loop)
; mm4 = temp
; mm5 = temp
; mm6 = temp
align 32
sad_sub22_mmxe:
push ebp ; save frame pointer
mov ebp, esp
push ebx
push ecx
push edx
pxor mm0, mm0 ; zero acculumator
mov eax, [ebp+8] ; get p1
mov ebx, [ebp+12] ; get p2
mov edx, [ebp+16] ; get lx
mov ecx, [ebp+20]
jmp nextrowfd
align 32
nextrowfd:
movq mm4, [eax] ; load first 8 bytes of p1 (row 1)
add eax, edx ; update pointer to next row
psadbw mm4, [ebx] ; compare to first 8 bytes of p2 (row 1)
add ebx, edx ; ditto
paddd mm0, mm4 ; accumulate difference
movq mm6, [eax] ; load first 8 bytes of p1 (row 2)
add eax, edx ; update pointer to next row
psadbw mm6, [ebx] ; compare to first 8 bytes of p2 (row 2)
add ebx, edx ; ditto
paddd mm0, mm6 ; accumulate difference
sub ecx, 2
jnz nextrowfd
movd eax, mm0
pop edx
pop ecx
pop ebx
pop ebp
emms
ret
global sad_sub44_mmxe
; int sad_sub44_mmxe(unsigned char *blk1,unsigned char *blk2,int qlx,int qh);
; eax = p1
; ebx = p2
; ecx = temp
; edx = qlx;
; esi = rowsleft
; mm0 = distance accumulator left block p1
; mm1 = distance accumulator right block p1
; mm2 = 0
; mm3 = 0
; mm4 = temp
; mm5 = temp
; mm6 = temp
align 32
sad_sub44_mmxe:
push ebp
mov ebp, esp
push ebx
push ecx
push edx
push esi
pxor mm0, mm0 ; zero acculumator
pxor mm1, mm1
pxor mm2, mm2
mov eax, [ebp+8] ; get p1
mov ebx, [ebp+12] ; get p2
mov edx, [ebp+16] ; get qlx
mov esi, [ebp+20] ; get rowsleft
jmp nextrowqd ; snap to it
align 32
nextrowqd:
movq mm4, [eax] ; load 8 bytes of p1 (two blocks!)
add eax, edx ; update pointer to next row
movq mm6, mm4 ;
mov ecx, [ebx] ; load 4 bytes of p2
punpcklbw mm4, mm2 ; mm4 = bytes 0..3 p1 (spaced out)
movd mm5, ecx
punpcklbw mm5, mm2 ; mm5 = bytes 0..3 p2 (spaced out)
psadbw mm4, mm5 ; compare to left block
add ebx, edx ; ditto
; punpckhbw mm6, mm2 ; mm6 = bytes 4..7 p1 (spaced out)
paddd mm0, mm4 ; accumulate difference left block
; psadbw mm6,mm5 ; compare to right block
; paddd mm1, mm6 ; accumulate difference right block
sub esi, 1
jnz nextrowqd
movd eax, mm0
; movd ebx, mm1
; sal ebx, 16
; or eax, ebx
pop esi
pop edx
pop ecx
pop ebx
pop ebp ; restore stack pointer
emms ; clear mmx registers
ret ; we now return you to your regular programming
;;;
;;; mblock_*nearest4_sad_mmxe.s:
;;;
;;; Enhanced MMX optimized Sum Absolute Differences routines for
;;; quads macroblocks offset by (0,0) (0,1) (1,0) (1,1) pel
;;;
;;; Explanation: the motion compensation search at 1-pel and 2*2 sub-sampled
;;; evaluates macroblock quads. A lot of memory accesses can be saved
;;; if each quad is done together rather than each macroblock in the
;;; quad handled individually.
;;; TODO: Really there ought to be MMX versions and the function's
;;; specification should be documented...
;
; Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
;;; CURRENTLY not used but used in testing as reference for tweaks...
global mblockq_sad_REF
; void mblockq_sad_REF(char *blk1,char *blk2,int lx,int h,int *weightvec);
; eax = p1
; ebx = p2
; ecx = unused
; edx = lx;
; edi = rowsleft
; esi = h
; mm0 = SAD (x+0,y+0)
; mm1 = SAD (x+2,y+0)
; mm2 = SAD (x+0,y+2)
; mm3 = SAD (x+2,y+2)
; mm4 = temp
; mm5 = temp
; mm6 = temp
; mm7 = temp
align 32
mblockq_sad_REF:
push ebp ; save frame pointer
mov ebp, esp ; link
push eax
push ebx
push ecx
push edx
push edi
push esi
pxor mm0, mm0 ; zero accumulators
pxor mm1, mm1
pxor mm2, mm2
pxor mm3, mm3
mov eax, [ebp+8] ; get p1
mov ebx, [ebp+12] ; get p2
mov edx, [ebp+16] ; get lx
mov edi, [ebp+20] ; get rowsleft
mov esi, edi
jmp nextrow_block_d1
align 32
nextrow_block_d1:
;; Do the (+0,+0) SAD
movq mm4, [eax] ; load 1st 8 bytes of p1
movq mm6, mm4
movq mm5, [ebx]
psadbw mm4, mm5 ; compare to 1st 8 bytes of p2
paddd mm0, mm4 ; accumulate difference
movq mm4, [eax+8] ; load 2nd 8 bytes of p1
movq mm7, mm4
psadbw mm4, [ebx+8] ; compare to 2nd 8 bytes of p2
paddd mm0, mm4 ; accumulate difference
cmp edi, esi
jz firstrow0
;; Do the (0,+2) SAD
sub ebx, edx
psadbw mm6, [ebx] ; compare to next 8 bytes of p2 (row 1)
paddd mm2, mm6 ; accumulate difference
psadbw mm7, [ebx+8] ; next 8 bytes of p1 (row 1)
add ebx, edx
paddd mm2, mm7
firstrow0:
;; Do the (+2,0) SAD
movq mm4, [eax+1]
movq mm6, mm4
psadbw mm4, mm5 ; compare to 1st 8 bytes of p2
paddd mm1, mm4 ; accumulate difference
movq mm4, [eax+9]
movq mm7, mm4
psadbw mm4, [ebx+8] ; compare to 2nd 8 bytes of p2
paddd mm1, mm4 ; accumulate difference
cmp edi, esi
jz firstrow1
;; Do the (+2, +2 ) SAD
sub ebx, edx
psadbw mm6, [ebx] ; compare to 1st 8 bytes of prev p2
psadbw mm7, [ebx+8] ; 2nd 8 bytes of prev p2
add ebx, edx
paddd mm3, mm6 ; accumulate difference
paddd mm3, mm7
firstrow1:
add eax, edx ; update pointer to next row
add ebx, edx ; ditto
sub edi, 1
jnz near nextrow_block_d1
;; Do the last row of the (0,+2) SAD
movq mm4, [eax] ; load 1st 8 bytes of p1
movq mm5, [eax+8] ; load 2nd 8 bytes of p1
sub ebx, edx
psadbw mm4, [ebx] ; compare to next 8 bytes of p2 (row 1)
psadbw mm5, [ebx+8] ; next 8 bytes of p1 (row 1)
paddd mm2, mm4 ; accumulate difference
paddd mm2, mm5
movq mm4, [eax+1]
movq mm5, [eax+9]
;; Do the last row of rhw (+2, +2) SAD
psadbw mm4, [ebx] ; compare to 1st 8 bytes of prev p2
psadbw mm5, [ebx+8] ; 2nd 8 bytes of prev p2
paddd mm3, mm4 ; accumulate difference
paddd mm3, mm5
mov eax, [ebp+24] ; Weightvec
movd [eax+0], mm0
movd [eax+4], mm1
movd [eax+8], mm2
movd [eax+12], mm3
pop esi
pop edi
pop edx
pop ecx
pop ebx
pop eax
pop ebp
emms
ret
global mblock_nearest4_sads_mmxe
; void mblock_nearest4_sads_mmxe(char *blk1,char *blk2,int lx,int h,int *weightvec);
; eax = p1
; ebx = p2
; ecx = unused
; edx = lx;
; edi = rowsleft
; esi = h
; mm0 = SAD (x+0,y+0),SAD (x+0,y+2)
; mm1 = SAD (x+2,y+0),SAD (x+2,y+2)
; mm4 = temp
; mm5 = temp
; mm6 = temp
; mm7 = temp
align 32
mblock_nearest4_sads_mmxe:
push ebp ; save frame pointer
mov ebp, esp ; link
push eax
push ebx
push ecx
push edx
push edi
push esi
mov eax, [ebp+8] ; get p1
prefetcht0 [eax]
pxor mm0, mm0 ; zero accumulators
pxor mm1, mm1
mov ebx, [ebp+12] ; get p2
mov edx, [ebp+16] ; get lx
mov edi, [ebp+20] ; get rowsleft
mov esi, edi
jmp nextrow_block_e1
align 32
nextrow_block_e1:
;; Do the (+0,+0) SAD
prefetcht0 [eax+edx]
movq mm4, [eax] ; load 1st 8 bytes of p1
movq mm6, mm4
movq mm5, [ebx]
psadbw mm4, mm5 ; compare to 1st 8 bytes of p2
paddd mm0, mm4 ; accumulate difference
movq mm4, [eax+8] ; load 2nd 8 bytes of p1
movq mm7, mm4
psadbw mm4, [ebx+8] ; compare to 2nd 8 bytes of p2
paddd mm0, mm4 ; accumulate difference
cmp edi, esi
jz firstrowe0
;; Do the (0,+2) SAD
sub ebx, edx
pshufw mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64
movq mm2, [ebx]
psadbw mm6, mm2 ; compare to next 8 bytes of p2 (row 1)
paddd mm0, mm6 ; accumulate difference
movq mm3, [ebx+8]
psadbw mm7, mm3 ; next 8 bytes of p1 (row 1)
add ebx, edx
paddd mm0, mm7
pshufw mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64
firstrowe0:
;; Do the (+2,0) SAD
movq mm4, [eax+1]
movq mm6, mm4
psadbw mm4, mm5 ; compare to 1st 8 bytes of p2
paddd mm1, mm4 ; accumulate difference
movq mm4, [eax+9]
movq mm7, mm4
psadbw mm4, [ebx+8] ; compare to 2nd 8 bytes of p2
paddd mm1, mm4 ; accumulate difference
cmp edi, esi
jz firstrowe1
;; Do the (+2, +2 ) SAD
sub ebx, edx
pshufw mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64
psadbw mm6, mm2 ; compare to 1st 8 bytes of prev p2
psadbw mm7, mm3 ; 2nd 8 bytes of prev p2
add ebx, edx
paddd mm1, mm6 ; accumulate difference
paddd mm1, mm7
pshufw mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64
firstrowe1:
add eax, edx ; update pointer to next row
add ebx, edx ; ditto
sub edi, 1
jnz near nextrow_block_e1
;; Do the last row of the (0,+2) SAD
pshufw mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64
movq mm4, [eax] ; load 1st 8 bytes of p1
movq mm5, [eax+8] ; load 2nd 8 bytes of p1
sub ebx, edx
psadbw mm4, [ebx] ; compare to next 8 bytes of p2 (row 1)
psadbw mm5, [ebx+8] ; next 8 bytes of p1 (row 1)
paddd mm0, mm4 ; accumulate difference
paddd mm0, mm5
;; Do the last row of rhw (+2, +2) SAD
pshufw mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64
movq mm4, [eax+1]
movq mm5, [eax+9]
psadbw mm4, [ebx] ; compare to 1st 8 bytes of prev p2
psadbw mm5, [ebx+8] ; 2nd 8 bytes of prev p2
paddd mm1, mm4 ; accumulate difference
paddd mm1, mm5
mov eax, [ebp+24] ; Weightvec
movd [eax+8], mm0
pshufw mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64
movd [eax+12], mm1
pshufw mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64
movd [eax+0], mm0
movd [eax+4], mm1
pop esi
pop edi
pop edx
pop ecx
pop ebx
pop eax
pop ebp
emms
ret
global mblock_sub22_nearest4_sads_mmxe
; void mblock_sub22_nearest4_sads_mmxe(unsigned char *blk1,unsigned char *blk2,int flx,int fh, int* resvec);
; eax = p1
; ebx = p2
; ecx = counter temp
; edx = flx;
; mm0 = distance accumulator
; mm1 = distance accumulator
; mm2 = previous p1 row
; mm3 = previous p1 displaced by 1 byte...
; mm4 = temp
; mm5 = temp
; mm6 = temp
; mm7 = temp / 0 if first row 0xff otherwise
align 32
mblock_sub22_nearest4_sads_mmxe:
push ebp ; save frame pointer
mov ebp, esp
push eax
push ebx
push ecx
push edx
pxor mm0, mm0 ; zero acculumator
pxor mm1, mm1 ; zero acculumator
pxor mm2, mm2 ; zero acculumator
pxor mm3, mm3 ; zero acculumator
mov eax, [ebp+8] ; get p1
mov ebx, [ebp+12] ; get p2
mov edx, [ebp+16] ; get lx
mov ecx, [ebp+20]
movq mm2, [eax+edx]
movq mm3, [eax+edx+1]
jmp nextrowbd22
align 32
nextrowbd22:
movq mm5, [ebx] ; load previous row reference block
; mm2 /mm3 containts current row target block
psadbw mm2, mm5 ; Comparse (x+0,y+2)
paddd mm1, mm2
psadbw mm3, mm5 ; Compare (x+2,y+2)
pshufw mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64
paddd mm1, mm3
pshufw mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64
movq mm2, [eax] ; Load current row traget block into mm2 / mm3
movq mm6, mm2
movq mm3, [eax+1]
sub eax, edx
sub ebx, edx
prefetcht0 [eax]
movq mm7, mm3
psadbw mm6, mm5 ; Compare (x+0,y+0)
paddd mm0, mm6
pshufw mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64
psadbw mm7, mm5 ; Compare (x+2,y+0)
paddd mm0, mm7
pshufw mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64
sub ecx, 1
jnz nextrowbd22
mov eax, [ebp+24]
movq [eax+0], mm0
movq [eax+8], mm1
pop edx
pop ecx
pop ebx
pop eax
pop ebp
emms
ret