mirror of
https://github.com/game-stop/veejay.git
synced 2025-12-20 14:50:01 +01:00
1003 lines
21 KiB
ArmAsm
1003 lines
21 KiB
ArmAsm
;;;
|
|
;;; mblock_sad_mmxe.s:
|
|
;;;
|
|
;;; Enhanced MMX optimized Sum Absolute Differences routines for macroblocks
|
|
;;; (interpolated, 1-pel, 2*2 sub-sampled pel and 4*4 sub-sampled pel)
|
|
;
|
|
; Original MMX sad_* Copyright (C) 2000 Chris Atenasio <chris@crud.net>
|
|
; Enhanced MMX and rest Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
|
|
|
|
;; Yes, I tried prefetch-ing. It makes no difference or makes
|
|
;; stuff *slower*.
|
|
|
|
;
|
|
; This program is free software; you can redistribute it and/or
|
|
; modify it under the terms of the GNU General Public License
|
|
; as published by the Free Software Foundation; either version 2
|
|
; of the License, or (at your option) any later version.
|
|
;
|
|
; This program is distributed in the hope that it will be useful,
|
|
; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
; GNU General Public License for more details.
|
|
;
|
|
; You should have received a copy of the GNU General Public License
|
|
; along with this program; if not, write to the Free Software
|
|
; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
|
;
|
|
;
|
|
;
|
|
|
|
SECTION .text
|
|
|
|
global sad_00_mmxe

;-----------------------------------------------------------------------
; int sad_00_mmxe(char *blk1, char *blk2, int lx, int h, int distlim);
;
; Sum of absolute differences between two blocks that are 16 bytes wide
; and h rows tall, with lx bytes between the starts of consecutive rows.
;
; Arguments are read from the stack relative to ebp (32-bit code):
;   [ebp+8] blk1    [ebp+12] blk2    [ebp+16] lx    [ebp+20] h
; distlim is never read: checking it costs more than the occasional
; early exit would save.
;
; Returns: eax = low 32 bits of the accumulated SAD.
; ebx, ecx and edx are saved and restored; emms is issued before ret.
;
; Two rows are handled per loop pass and the loop only ends when the
; row counter reaches exactly zero, so h must be a positive even number.
;
; Register use:
;   eax = blk1 row pointer      ebx = blk2 row pointer
;   ecx = rows left             edx = lx
;   mm0 = SAD accumulator       mm4, mm5, mm6 = temporaries
;-----------------------------------------------------------------------

align 32
sad_00_mmxe:
        push    ebp
        mov     ebp, esp
        push    ebx
        push    ecx
        push    edx

        pxor    mm0, mm0                ; SAD accumulator = 0
        mov     eax, [ebp+8]            ; eax = blk1

        ; sad_00_Ammxe jumps here when blk1 is already 8-byte aligned.
        ; At this point the frame above is set up, eax = blk1, mm0 = 0.
sad_00_0misalign:
        mov     ecx, [ebp+20]           ; ecx = h
        mov     edx, [ebp+16]           ; edx = lx
        mov     ebx, [ebp+12]           ; ebx = blk2
        jmp     .rowpair                ; hop over the alignment padding

align 32
.rowpair:
        ; ---- first row of the pair ----
        movq    mm4, [eax]              ; blk1 bytes 0..7
        movq    mm5, [eax+8]            ; blk1 bytes 8..15
        psadbw  mm4, [ebx]              ; SAD against blk2 bytes 0..7
        psadbw  mm5, [ebx+8]            ; SAD against blk2 bytes 8..15
        paddd   mm0, mm4
        paddd   mm0, mm5
        add     eax, edx                ; both pointers down one row
        add     ebx, edx

        ; ---- second row of the pair ----
        movq    mm6, [eax]
        movq    mm4, [eax+8]
        psadbw  mm6, [ebx]
        psadbw  mm4, [ebx+8]
        paddd   mm0, mm6
        paddd   mm0, mm4
        add     eax, edx
        add     ebx, edx

        sub     ecx, 2                  ; two rows consumed
        jnz     .rowpair

        movd    eax, mm0                ; return value

        pop     edx
        pop     ecx
        pop     ebx
        pop     ebp
        emms
        ret
|
|
|
|
|
|
|
|
|
|
global sad_00_Ammxe

;-----------------------------------------------------------------------
; int sad_00_Ammxe(char *blk1, char *blk2, int lx, int h, int distlim);
;
; Variant of sad_00_mmxe that fetches blk1 only through 8-byte aligned
; base addresses and rebuilds the wanted bytes with shifts.  It was
; written to see whether that is faster on a P-III; the answer was no,
; so it is no longer used.
;
; Same stack arguments and return value as sad_00_mmxe (distlim is not
; read).  If blk1 is already a multiple of 8 the routine continues at
; sad_00_0misalign inside sad_00_mmxe, which shares this stack frame.
;
; For a misalignment of m bytes (1..7) each row reads three quadwords
; starting at blk1 - m, i.e. it touches m bytes before the row and up
; to blk1 - m + 23.  One row is processed per loop pass; the loop ends
; only when the counter reaches exactly zero.
;
; Register use:
;   eax = rounded-down blk1 row pointer   ebx = blk2 row pointer
;   ecx = rows left                       edx = lx
;   mm0 = SAD accumulator
;   mm2 = right-shift count in bits (8*m)
;   mm3 = left-shift count in bits (64 - 8*m)
;   mm4, mm5, mm6, mm7 = temporaries
;-----------------------------------------------------------------------

align 32
sad_00_Ammxe:
        push    ebp
        mov     ebp, esp
        push    ebx
        push    ecx
        push    edx

        pxor    mm0, mm0                ; SAD accumulator = 0

        mov     eax, [ebp+8]            ; eax = blk1
        mov     ebx, eax
        and     ebx, 7                  ; ebx = m, misalignment in bytes
        test    ebx, ebx
        jz      near sad_00_0misalign   ; aligned: use the plain routine

        sub     eax, ebx                ; round blk1 down to 8-byte boundary
        shl     ebx, 3                  ; ebx = 8*m bits
        mov     ecx, 64
        sub     ecx, ebx                ; ecx = 64 - 8*m bits
        movd    mm2, ebx                ; shift that discards the m leading bytes
        movd    mm3, ecx                ; shift that brings in the following bytes

        mov     ecx, [ebp+20]           ; ecx = h
        mov     edx, [ebp+16]           ; edx = lx
        mov     ebx, [ebp+12]           ; ebx = blk2
        jmp     .row                    ; hop over the alignment padding

align 32
.row:
        movq    mm4, [eax]              ; quadword 0 of the aligned window
        movq    mm5, [eax+8]            ; quadword 1
        movq    mm7, [eax+16]           ; quadword 2
        movq    mm6, mm5                ; quadword 1 feeds both halves

        ; blk1 bytes 0..7 = (q0 >> 8m) | (q1 << (64 - 8m))
        psrlq   mm4, mm2
        psllq   mm5, mm3
        por     mm4, mm5

        ; blk1 bytes 8..15 = (q1 >> 8m) | (q2 << (64 - 8m))
        psrlq   mm6, mm2
        psllq   mm7, mm3
        por     mm6, mm7

        psadbw  mm4, [ebx]              ; SAD against blk2 bytes 0..7
        psadbw  mm6, [ebx+8]            ; SAD against blk2 bytes 8..15
        paddd   mm0, mm4
        paddd   mm0, mm6

        add     eax, edx                ; next row
        add     ebx, edx

        sub     ecx, 1
        jnz     .row

        movd    eax, mm0                ; return value

        pop     edx
        pop     ecx
        pop     ebx
        pop     ebp
        emms
        ret
|
|
|
|
|
|
global sad_01_mmxe

;-----------------------------------------------------------------------
; int sad_01_mmxe(char *blk1, char *blk2, int lx, int h);
;
; SAD between blk2 and blk1 interpolated horizontally: every blk1 byte
; is first averaged (pavgb) with its right-hand neighbour.  Blocks are
; 16 bytes wide, so 17 bytes of each blk1 row are read.
;
; Stack arguments: [ebp+8] blk1, [ebp+12] blk2, [ebp+16] lx, [ebp+20] h.
; Returns: eax = low 32 bits of the accumulated SAD.
; ebx, ecx and edx are saved and restored; emms is issued before ret.
;
; Two rows are handled per loop pass and the loop only ends when the
; row counter reaches exactly zero, so h must be a positive even number.
;
; Register use:
;   eax = blk1 row pointer      ebx = blk2 row pointer
;   ecx = rows left             edx = lx
;   mm0 = SAD accumulator       mm4..mm7 = temporaries
;-----------------------------------------------------------------------

align 32
sad_01_mmxe:
        push    ebp
        mov     ebp, esp
        push    ebx
        push    ecx
        push    edx

        pxor    mm0, mm0                ; SAD accumulator = 0

        mov     eax, [ebp+8]            ; eax = blk1
        mov     ebx, [ebp+12]           ; ebx = blk2
        mov     edx, [ebp+16]           ; edx = lx
        mov     ecx, [ebp+20]           ; ecx = h
        jmp     .rowpair                ; hop over the alignment padding

align 32
.rowpair:
        ; ---- first row of the pair ----
        movq    mm4, [eax]              ; blk1 bytes 0..7
        movq    mm5, [eax+8]            ; blk1 bytes 8..15
        pavgb   mm4, [eax+1]            ; average with right neighbours
        pavgb   mm5, [eax+9]
        psadbw  mm4, [ebx]              ; SAD against blk2 bytes 0..7
        psadbw  mm5, [ebx+8]            ; SAD against blk2 bytes 8..15
        paddd   mm0, mm4
        paddd   mm0, mm5
        add     eax, edx                ; next row
        add     ebx, edx

        ; ---- second row of the pair ----
        movq    mm6, [eax]
        movq    mm7, [eax+8]
        pavgb   mm6, [eax+1]
        pavgb   mm7, [eax+9]
        psadbw  mm6, [ebx]
        psadbw  mm7, [ebx+8]
        paddd   mm0, mm6
        paddd   mm0, mm7
        add     eax, edx
        add     ebx, edx

        sub     ecx, 2                  ; two rows consumed
        jnz     .rowpair

        movd    eax, mm0                ; return value

        pop     edx
        pop     ecx
        pop     ebx
        pop     ebp
        emms                            ; leave the MMX state clean
        ret
|
|
|
|
|
|
global sad_10_mmxe

;-----------------------------------------------------------------------
; int sad_10_mmxe(char *blk1, char *blk2, int lx, int h);
;
; SAD between blk2 and blk1 interpolated vertically: every blk1 byte is
; first averaged (pavgb) with the byte directly below it (lx bytes
; further on).  Blocks are 16 bytes wide, so h+1 rows of blk1 are read.
;
; Stack arguments: [ebp+8] blk1, [ebp+12] blk2, [ebp+16] lx, [ebp+20] h.
; Returns: eax = low 32 bits of the accumulated SAD.
; ebx, ecx, edx and edi are saved and restored; emms is issued before
; ret.
;
; Two rows are handled per loop pass and the loop only ends when the
; row counter reaches exactly zero, so h must be a positive even number.
;
; Register use:
;   eax = blk1 row pointer      ebx = blk2 row pointer
;   ecx = rows left             edx = lx
;   edi = blk1 row pointer + lx (the row below)
;   mm0 = SAD accumulator       mm4..mm7 = temporaries
;-----------------------------------------------------------------------

align 32
sad_10_mmxe:
        push    ebp
        mov     ebp, esp
        push    ebx
        push    ecx
        push    edx
        push    edi

        pxor    mm0, mm0                ; SAD accumulator = 0

        mov     eax, [ebp+8]            ; eax = blk1
        mov     ebx, [ebp+12]           ; ebx = blk2
        mov     edx, [ebp+16]           ; edx = lx
        lea     edi, [eax+edx]          ; edi = row below blk1
        mov     ecx, [ebp+20]           ; ecx = h
        jmp     .rowpair                ; hop over the alignment padding

align 32
.rowpair:
        ; ---- first row of the pair ----
        movq    mm4, [eax]              ; blk1 bytes 0..7
        pavgb   mm4, [edi]              ; average with the row below
        psadbw  mm4, [ebx]              ; SAD against blk2 bytes 0..7
        paddd   mm0, mm4

        movq    mm5, [eax+8]            ; blk1 bytes 8..15
        pavgb   mm5, [edi+8]
        psadbw  mm5, [ebx+8]
        paddd   mm0, mm5

        add     eax, edx                ; all three pointers down one row
        add     ebx, edx
        add     edi, edx

        ; ---- second row of the pair ----
        movq    mm6, [eax]
        pavgb   mm6, [edi]
        psadbw  mm6, [ebx]
        paddd   mm0, mm6

        movq    mm7, [eax+8]
        pavgb   mm7, [edi+8]
        psadbw  mm7, [ebx+8]
        paddd   mm0, mm7

        add     eax, edx
        add     ebx, edx
        add     edi, edx

        sub     ecx, 2                  ; two rows consumed
        jnz     .rowpair

        movd    eax, mm0                ; return value

        pop     edi
        pop     edx
        pop     ecx
        pop     ebx
        pop     ebp
        emms                            ; leave the MMX state clean
        ret
|
|
|
|
|
|
global sad_11_mmxe

;-----------------------------------------------------------------------
; int sad_11_mmxe(char *blk1, char *blk2, int lx, int h);
;
; SAD between blk2 and blk1 interpolated both horizontally and
; vertically.  Each predicted byte is built from three pavgb steps:
;   avg( avg(p[x], p[x+lx]), avg(p[x+1], p[x+lx+1]) )
; Blocks are 16 bytes wide, so 17 bytes of h+1 blk1 rows are read.
;
; Stack arguments: [ebp+8] blk1, [ebp+12] blk2, [ebp+16] lx, [ebp+20] h.
; Returns: eax = low 32 bits of the accumulated SAD.
; ebx, ecx, edx and edi are saved and restored; emms is issued before
; ret.
;
; Two rows are handled per loop pass and the loop only ends when the
; row counter reaches exactly zero, so h must be a positive even number.
;
; Register use:
;   eax = blk1 row pointer      ebx = blk2 row pointer
;   ecx = rows left             edx = lx
;   edi = blk1 row pointer + lx (the row below)
;   mm0 = SAD accumulator       mm4..mm7 = temporaries
;-----------------------------------------------------------------------

align 32
sad_11_mmxe:
        push    ebp
        mov     ebp, esp
        push    ebx
        push    ecx
        push    edx
        push    edi

        pxor    mm0, mm0                ; SAD accumulator = 0

        mov     eax, [ebp+8]            ; eax = blk1
        mov     ebx, [ebp+12]           ; ebx = blk2
        mov     edx, [ebp+16]           ; edx = lx
        mov     edi, eax
        add     edi, edx                ; edi = row below blk1
        mov     ecx, [ebp+20]           ; ecx = h
        jmp     .rowpair                ; hop over the alignment padding

align 32
.rowpair:
%rep 2                                  ; the two row steps are identical
        ; left 8 bytes
        movq    mm4, [eax]              ; p[x]
        pavgb   mm4, [edi]              ; avg with p[x+lx]
        movq    mm5, [eax+1]            ; p[x+1]
        pavgb   mm5, [edi+1]            ; avg with p[x+lx+1]
        pavgb   mm4, mm5                ; combine the two column averages
        psadbw  mm4, [ebx]              ; SAD against blk2 bytes 0..7
        paddd   mm0, mm4

        ; right 8 bytes
        movq    mm6, [eax+8]
        pavgb   mm6, [edi+8]
        movq    mm7, [eax+9]
        pavgb   mm7, [edi+9]
        pavgb   mm6, mm7
        psadbw  mm6, [ebx+8]            ; SAD against blk2 bytes 8..15
        paddd   mm0, mm6

        add     eax, edx                ; all three pointers down one row
        add     ebx, edx
        add     edi, edx
%endrep

        sub     ecx, 2                  ; two rows consumed
        jnz     near .rowpair

        movd    eax, mm0                ; return value

        pop     edi
        pop     edx
        pop     ecx
        pop     ebx
        pop     ebp
        emms                            ; leave the MMX state clean
        ret
|
|
|
|
global sad_sub22_mmxe

;-----------------------------------------------------------------------
; int sad_sub22_mmxe(unsigned char *blk1, unsigned char *blk2,
;                    int flx, int fh);
;
; SAD between two blocks that are 8 bytes wide and fh rows tall, with
; flx bytes between the starts of consecutive rows (the 2*2 sub-sampled
; case).
;
; Stack arguments: [ebp+8] blk1, [ebp+12] blk2, [ebp+16] flx,
;                  [ebp+20] fh.
; Returns: eax = low 32 bits of the accumulated SAD.
; ebx, ecx and edx are saved and restored; emms is issued before ret.
;
; Two rows are handled per loop pass and the loop only ends when the
; row counter reaches exactly zero, so fh must be a positive even
; number.
;
; Register use:
;   eax = blk1 row pointer      ebx = blk2 row pointer
;   ecx = rows left             edx = flx
;   mm0 = SAD accumulator       mm4, mm6 = temporaries
;-----------------------------------------------------------------------

align 32
sad_sub22_mmxe:
        push    ebp
        mov     ebp, esp
        push    ebx
        push    ecx
        push    edx

        pxor    mm0, mm0                ; SAD accumulator = 0

        mov     eax, [ebp+8]            ; eax = blk1
        mov     ebx, [ebp+12]           ; ebx = blk2
        mov     edx, [ebp+16]           ; edx = flx
        mov     ecx, [ebp+20]           ; ecx = fh
        jmp     .rowpair                ; hop over the alignment padding

align 32
.rowpair:
        ; ---- first row of the pair ----
        movq    mm4, [eax]              ; 8 bytes of blk1
        psadbw  mm4, [ebx]              ; SAD against 8 bytes of blk2
        paddd   mm0, mm4
        add     eax, edx                ; next row
        add     ebx, edx

        ; ---- second row of the pair ----
        movq    mm6, [eax]
        psadbw  mm6, [ebx]
        paddd   mm0, mm6
        add     eax, edx
        add     ebx, edx

        sub     ecx, 2                  ; two rows consumed
        jnz     .rowpair

        movd    eax, mm0                ; return value

        pop     edx
        pop     ecx
        pop     ebx
        pop     ebp
        emms
        ret
|
|
|
|
|
|
|
|
|
|
|
|
global sad_sub44_mmxe

;-----------------------------------------------------------------------
; int sad_sub44_mmxe(unsigned char *blk1, unsigned char *blk2,
;                    int qlx, int qh);
;
; SAD between two blocks that are 4 bytes wide and qh rows tall, with
; qlx bytes between the starts of consecutive rows (the 4*4 sub-sampled
; case).
;
; Each row is fetched with a 4-byte movd, which clears the upper half
; of the MMX register.  psadbw therefore sums |blk1[i] - blk2[i]| for
; i = 0..3 only and no byte beyond the 4-byte row is read.
;
; Stack arguments: [ebp+8] blk1, [ebp+12] blk2, [ebp+16] qlx,
;                  [ebp+20] qh.
; Returns: eax = low 32 bits of the accumulated SAD.
; ebx, edx and esi are saved and restored, ecx is not touched; emms is
; issued before ret.
;
; One row is handled per loop pass and the loop only ends when the row
; counter reaches exactly zero, so qh must be positive.
;
; Register use:
;   eax = blk1 row pointer      ebx = blk2 row pointer
;   edx = qlx                   esi = rows left
;   mm0 = SAD accumulator       mm4, mm5 = temporaries
;-----------------------------------------------------------------------

align 32
sad_sub44_mmxe:
        push    ebp
        mov     ebp, esp
        push    ebx
        push    edx
        push    esi

        pxor    mm0, mm0                ; SAD accumulator = 0

        mov     eax, [ebp+8]            ; eax = blk1
        mov     ebx, [ebp+12]           ; ebx = blk2
        mov     edx, [ebp+16]           ; edx = qlx
        mov     esi, [ebp+20]           ; esi = qh
        jmp     .row                    ; hop over the alignment padding

align 32
.row:
        movd    mm4, dword [eax]        ; blk1 bytes 0..3, upper half zero
        movd    mm5, dword [ebx]        ; blk2 bytes 0..3, upper half zero
        add     eax, edx                ; next row
        add     ebx, edx
        psadbw  mm4, mm5                ; zero upper bytes contribute nothing
        paddd   mm0, mm4

        sub     esi, 1
        jnz     .row

        movd    eax, mm0                ; return value

        pop     esi
        pop     edx
        pop     ebx
        pop     ebp
        emms                            ; leave the MMX state clean
        ret
|
|
|
|
|
|
;;;
|
|
;;; mblock_*nearest4_sad_mmxe.s:
|
|
;;;
|
|
;;; Enhanced MMX optimized Sum Absolute Differences routines for
|
|
;;; quads macroblocks offset by (0,0) (0,1) (1,0) (1,1) pel
|
|
;;;
|
|
|
|
;;; Explanation: the motion compensation search at 1-pel and 2*2 sub-sampled
|
|
;;; evaluates macroblock quads. A lot of memory accesses can be saved
|
|
;;; if each quad is done together rather than each macroblock in the
|
|
;;; quad handled individually.
|
|
|
|
;;; TODO: Really there ought to be MMX versions and the function's
|
|
;;; specification should be documented...
|
|
;
|
|
; Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
|
|
|
|
|
|
;;; CURRENTLY not used but used in testing as reference for tweaks...
|
|
global mblockq_sad_REF

;-----------------------------------------------------------------------
; void mblockq_sad_REF(char *blk1, char *blk2, int lx, int h,
;                      int *weightvec);
;
; Computes four SADs of a block 16 bytes wide and h rows tall in one
; pass and stores them as four dwords:
;   weightvec[0]  blk1                         against blk2
;   weightvec[1]  blk1 moved one byte right    against blk2
;   weightvec[2]  blk1 moved one row down      against blk2
;   weightvec[3]  blk1 moved right and down    against blk2
; (the in-line labels below keep the original (x+2, y+2) notation for
; these displacements).
;
; The "one row down" sums are formed by comparing the current blk1 row
; with the previous blk2 row.  That is skipped on the first pass and
; completed after the loop with blk1 row h, so h+1 rows of blk1 and
; 17 bytes of each row are read.
;
; Stack arguments: [ebp+8] blk1, [ebp+12] blk2, [ebp+16] lx,
;                  [ebp+20] h, [ebp+24] weightvec.
; eax, ebx, ecx, edx, edi and esi are saved and restored; emms is
; issued before ret.  The loop ends only when the row counter reaches
; exactly zero, so h must be positive.
;
; Register use:
;   eax = blk1 row pointer      ebx = blk2 row pointer
;   edx = lx                    edi = rows left      esi = h
;   mm0 = SAD (x+0,y+0)         mm1 = SAD (x+2,y+0)
;   mm2 = SAD (x+0,y+2)         mm3 = SAD (x+2,y+2)
;   mm4..mm7 = temporaries
;-----------------------------------------------------------------------

align 32
mblockq_sad_REF:
        push    ebp
        mov     ebp, esp
        push    eax
        push    ebx
        push    ecx
        push    edx
        push    edi
        push    esi

        pxor    mm0, mm0                ; clear the four accumulators
        pxor    mm1, mm1
        pxor    mm2, mm2
        pxor    mm3, mm3

        mov     eax, [ebp+8]            ; eax = blk1
        mov     ebx, [ebp+12]           ; ebx = blk2
        mov     edx, [ebp+16]           ; edx = lx
        mov     edi, [ebp+20]           ; edi = rows left
        mov     esi, edi                ; esi = h, used to spot the first row
        jmp     .row                    ; hop over the alignment padding

align 32
.row:
        ;; (+0,+0): blk1 row against blk2 row
        movq    mm5, [ebx]              ; blk2 bytes 0..7, reused further down
        movq    mm4, [eax]              ; blk1 bytes 0..7
        movq    mm6, mm4                ; keep a copy for the row-above SAD
        psadbw  mm4, mm5
        paddd   mm0, mm4
        movq    mm4, [eax+8]            ; blk1 bytes 8..15
        movq    mm7, mm4                ; keep a copy for the row-above SAD
        psadbw  mm4, [ebx+8]
        paddd   mm0, mm4

        cmp     edi, esi                ; first row has no blk2 row above it
        jz      .skip_above_0

        ;; (+0,+2): blk1 row against the previous blk2 row
        sub     ebx, edx                ; step back to the previous blk2 row
        psadbw  mm6, [ebx]
        psadbw  mm7, [ebx+8]
        add     ebx, edx                ; and forward again
        paddd   mm2, mm6
        paddd   mm2, mm7

.skip_above_0:
        ;; (+2,+0): blk1 row moved one byte right against blk2 row
        movq    mm4, [eax+1]
        movq    mm6, mm4                ; keep a copy for the row-above SAD
        psadbw  mm4, mm5                ; mm5 still holds blk2 bytes 0..7
        paddd   mm1, mm4
        movq    mm4, [eax+9]
        movq    mm7, mm4                ; keep a copy for the row-above SAD
        psadbw  mm4, [ebx+8]
        paddd   mm1, mm4

        cmp     edi, esi
        jz      .skip_above_1

        ;; (+2,+2): moved blk1 row against the previous blk2 row
        sub     ebx, edx
        psadbw  mm6, [ebx]
        psadbw  mm7, [ebx+8]
        add     ebx, edx
        paddd   mm3, mm6
        paddd   mm3, mm7

.skip_above_1:
        add     eax, edx                ; both pointers down one row
        add     ebx, edx

        sub     edi, 1
        jnz     near .row

        ;; Finish the two "row down" sums: blk1 row h against the last
        ;; blk2 row.
        sub     ebx, edx                ; back to the last blk2 row

        movq    mm4, [eax]
        movq    mm5, [eax+8]
        psadbw  mm4, [ebx]
        psadbw  mm5, [ebx+8]
        paddd   mm2, mm4
        paddd   mm2, mm5

        movq    mm4, [eax+1]
        movq    mm5, [eax+9]
        psadbw  mm4, [ebx]
        psadbw  mm5, [ebx+8]
        paddd   mm3, mm4
        paddd   mm3, mm5

        mov     eax, [ebp+24]           ; eax = weightvec
        movd    [eax+0], mm0
        movd    [eax+4], mm1
        movd    [eax+8], mm2
        movd    [eax+12], mm3

        pop     esi
        pop     edi
        pop     edx
        pop     ecx
        pop     ebx
        pop     eax
        pop     ebp
        emms
        ret
|
|
|
|
|
|
|
|
global mblock_nearest4_sads_mmxe

;-----------------------------------------------------------------------
; void mblock_nearest4_sads_mmxe(char *blk1, char *blk2, int lx, int h,
;                                int *weightvec);
;
; Computes four SADs of a block 16 bytes wide and h rows tall in one
; pass and stores them as four dwords:
;   weightvec[0]  blk1                         against blk2
;   weightvec[1]  blk1 moved one byte right    against blk2
;   weightvec[2]  blk1 moved one row down      against blk2
;   weightvec[3]  blk1 moved right and down    against blk2
; (the in-line labels below keep the original (x+2, y+2) notation for
; these displacements).
;
; Two sums share each MMX accumulator, one per dword.  psadbw always
; leaves its result in the low dword, so pshufw swaps the two dwords
; around every update of the "row down" sum.
;
; The "row down" sums compare the current blk1 row with the previous
; blk2 row.  That is skipped on the first pass and completed after the
; loop with blk1 row h, so h+1 rows of blk1 and 17 bytes of each row
; are read.
;
; Stack arguments: [ebp+8] blk1, [ebp+12] blk2, [ebp+16] lx,
;                  [ebp+20] h, [ebp+24] weightvec.
; eax, ebx, ecx, edx, edi and esi are saved and restored; emms is
; issued before ret.  The loop ends only when the row counter reaches
; exactly zero, so h must be positive.
;
; Register use:
;   eax = blk1 row pointer      ebx = blk2 row pointer
;   edx = lx                    edi = rows left      esi = h
;   mm0 = low dword SAD (x+0,y+0), high dword SAD (x+0,y+2)
;   mm1 = low dword SAD (x+2,y+0), high dword SAD (x+2,y+2)
;   mm2, mm3 = previous blk2 row (bytes 0..7 and 8..15)
;   mm4..mm7 = temporaries
;-----------------------------------------------------------------------

; pshufw selector that exchanges the low and high dwords of an MMX
; register: word order 2,3,0,1.
%define NEAREST4_SWAP_DWORDS (2*1 + 3*4 + 0*16 + 1*64)

align 32
mblock_nearest4_sads_mmxe:
        push    ebp
        mov     ebp, esp
        push    eax
        push    ebx
        push    ecx
        push    edx
        push    edi
        push    esi

        mov     eax, [ebp+8]            ; eax = blk1
        prefetcht0 [eax]
        pxor    mm0, mm0                ; clear both accumulator pairs
        pxor    mm1, mm1
        mov     ebx, [ebp+12]           ; ebx = blk2
        mov     edx, [ebp+16]           ; edx = lx
        mov     edi, [ebp+20]           ; edi = rows left
        mov     esi, edi                ; esi = h, used to spot the first row
        jmp     .row                    ; hop over the alignment padding

align 32
.row:
        ;; (+0,+0): blk1 row against blk2 row
        prefetcht0 [eax+edx]            ; start fetching the next blk1 row
        movq    mm5, [ebx]              ; blk2 bytes 0..7, reused further down
        movq    mm4, [eax]              ; blk1 bytes 0..7
        movq    mm6, mm4                ; keep a copy for the row-above SAD
        psadbw  mm4, mm5
        paddd   mm0, mm4
        movq    mm4, [eax+8]            ; blk1 bytes 8..15
        movq    mm7, mm4                ; keep a copy for the row-above SAD
        psadbw  mm4, [ebx+8]
        paddd   mm0, mm4

        cmp     edi, esi                ; first row has no blk2 row above it
        jz      .skip_above_0

        ;; (+0,+2): blk1 row against the previous blk2 row
        sub     ebx, edx                ; step back to the previous blk2 row
        movq    mm2, [ebx]              ; kept for the (+2,+2) SAD as well
        movq    mm3, [ebx+8]
        add     ebx, edx                ; and forward again
        pshufw  mm0, mm0, NEAREST4_SWAP_DWORDS  ; bring (+0,+2) sum to the low dword
        psadbw  mm6, mm2
        psadbw  mm7, mm3
        paddd   mm0, mm6
        paddd   mm0, mm7
        pshufw  mm0, mm0, NEAREST4_SWAP_DWORDS  ; put (+0,+0) sum back in the low dword

.skip_above_0:
        ;; (+2,+0): blk1 row moved one byte right against blk2 row
        movq    mm4, [eax+1]
        movq    mm6, mm4                ; keep a copy for the row-above SAD
        psadbw  mm4, mm5                ; mm5 still holds blk2 bytes 0..7
        paddd   mm1, mm4
        movq    mm4, [eax+9]
        movq    mm7, mm4                ; keep a copy for the row-above SAD
        psadbw  mm4, [ebx+8]
        paddd   mm1, mm4

        cmp     edi, esi
        jz      .skip_above_1

        ;; (+2,+2): moved blk1 row against the previous blk2 row, which
        ;; is still held in mm2/mm3, so ebx does not need to move.
        pshufw  mm1, mm1, NEAREST4_SWAP_DWORDS  ; bring (+2,+2) sum to the low dword
        psadbw  mm6, mm2
        psadbw  mm7, mm3
        paddd   mm1, mm6
        paddd   mm1, mm7
        pshufw  mm1, mm1, NEAREST4_SWAP_DWORDS  ; put (+2,+0) sum back in the low dword

.skip_above_1:
        add     eax, edx                ; both pointers down one row
        add     ebx, edx

        sub     edi, 1
        jnz     near .row

        ;; Finish the two "row down" sums: blk1 row h against the last
        ;; blk2 row.  Both accumulators are left with that sum in the
        ;; low dword.
        sub     ebx, edx                ; back to the last blk2 row

        pshufw  mm0, mm0, NEAREST4_SWAP_DWORDS
        movq    mm4, [eax]
        movq    mm5, [eax+8]
        psadbw  mm4, [ebx]
        psadbw  mm5, [ebx+8]
        paddd   mm0, mm4
        paddd   mm0, mm5

        pshufw  mm1, mm1, NEAREST4_SWAP_DWORDS
        movq    mm4, [eax+1]
        movq    mm5, [eax+9]
        psadbw  mm4, [ebx]
        psadbw  mm5, [ebx+8]
        paddd   mm1, mm4
        paddd   mm1, mm5

        mov     eax, [ebp+24]           ; eax = weightvec
        movd    [eax+8], mm0            ; SAD (x+0,y+2)
        movd    [eax+12], mm1           ; SAD (x+2,y+2)
        pshufw  mm0, mm0, NEAREST4_SWAP_DWORDS
        pshufw  mm1, mm1, NEAREST4_SWAP_DWORDS
        movd    [eax+0], mm0            ; SAD (x+0,y+0)
        movd    [eax+4], mm1            ; SAD (x+2,y+0)

        pop     esi
        pop     edi
        pop     edx
        pop     ecx
        pop     ebx
        pop     eax
        pop     ebp
        emms
        ret
|
|
|
|
global mblock_sub22_nearest4_sads_mmxe

;-----------------------------------------------------------------------
; void mblock_sub22_nearest4_sads_mmxe(unsigned char *blk1,
;                                      unsigned char *blk2,
;                                      int flx, int fh, int *resvec);
;
; Computes four SADs of a block 8 bytes wide over fh rows and stores
; them as four dwords:
;   resvec[0]  blk1                         against blk2
;   resvec[1]  blk1 moved one byte right    against blk2
;   resvec[2]  blk1 moved one row down      against blk2
;   resvec[3]  blk1 moved right and down    against blk2
;
; The block is walked from higher to lower addresses: both row pointers
; are decreased by flx on every pass, so blk1 and blk2 must point at
; the last row to be compared.  The blk1 row at blk1 + flx is read
; before the loop to seed the "row down" comparison.  After the final
; pass one further blk1 row is prefetched (not loaded).
;
; Stack arguments: [ebp+8] blk1, [ebp+12] blk2, [ebp+16] flx,
;                  [ebp+20] fh, [ebp+24] resvec.
; eax, ebx, ecx and edx are saved and restored; emms is issued before
; ret.  The loop ends only when the row counter reaches exactly zero,
; so fh must be positive.
;
; Register use:
;   eax = blk1 row pointer      ebx = blk2 row pointer
;   ecx = rows left             edx = flx
;   mm0 = low dword SAD (+0,+0), high dword SAD (+1 byte, +0)
;   mm1 = low dword SAD (+0, +1 row), high dword SAD (+1 byte, +1 row)
;   mm2 = blk1 row one below the current one
;   mm3 = same row displaced by one byte
;   mm5 = current blk2 row      mm6, mm7 = temporaries
;-----------------------------------------------------------------------

align 32
mblock_sub22_nearest4_sads_mmxe:
        push    ebp
        mov     ebp, esp
        push    eax
        push    ebx
        push    ecx
        push    edx

        pxor    mm0, mm0                ; clear both accumulator pairs
        pxor    mm1, mm1

        mov     eax, [ebp+8]            ; eax = blk1
        mov     ebx, [ebp+12]           ; ebx = blk2
        mov     edx, [ebp+16]           ; edx = flx
        mov     ecx, [ebp+20]           ; ecx = fh

        movq    mm2, [eax+edx]          ; blk1 row below the starting row
        movq    mm3, [eax+edx+1]        ; ... and its one-byte displacement
        jmp     .row                    ; hop over the alignment padding

align 32
.row:
        movq    mm5, [ebx]              ; current blk2 row

        ;; Row-down pair: blk1 row held over from the previous pass
        ;; against the current blk2 row.
        psadbw  mm2, mm5                ; (+0, +1 row), result in low dword
        psadbw  mm3, mm5                ; (+1 byte, +1 row)
        psllq   mm3, 32                 ; move that result to the high dword
        paddd   mm1, mm2
        paddd   mm1, mm3

        ;; Load the current blk1 row; it is also kept in mm2/mm3 so the
        ;; next pass can use it as the row below.
        movq    mm2, [eax]
        movq    mm3, [eax+1]
        movq    mm6, mm2
        movq    mm7, mm3

        sub     eax, edx                ; step both pointers one row back
        sub     ebx, edx
        prefetcht0 [eax]

        ;; Same-row pair: current blk1 row against the current blk2 row.
        psadbw  mm6, mm5                ; (+0, +0), result in low dword
        psadbw  mm7, mm5                ; (+1 byte, +0)
        psllq   mm7, 32                 ; move that result to the high dword
        paddd   mm0, mm6
        paddd   mm0, mm7

        sub     ecx, 1
        jnz     .row

        mov     eax, [ebp+24]           ; eax = resvec
        movq    [eax+0], mm0            ; resvec[0], resvec[1]
        movq    [eax+8], mm1            ; resvec[2], resvec[3]

        pop     edx
        pop     ecx
        pop     ebx
        pop     eax
        pop     ebp
        emms
        ret
|
|
|
|
|