Files
veejay/veejay-2005/utils/mblock_sumsq_mmx.s
Niels Elburg c84f6ca821 moved veejay-current to veejay-2005 (branch closed)
git-svn-id: svn://code.dyne.org/veejay/trunk@509 eb8d1916-c9e9-0310-b8de-cf0c9472ead5
2006-01-20 16:43:25 +00:00

674 lines
13 KiB
ArmAsm

;
; dist2_mmx.s: mmX optimized squared distance sum
;
; Original believed to be Copyright (C) 2000 Brent Byeler
;
; This program is free software; you can reaxstribute it and/or
; modify it under the terms of the GNU General Public License
; as published by the Free Software Foundation; either version 2
; of the License, or (at your option) any later version.
;
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program; if not, write to the Free Software
; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
;
; total squared difference between two (16*h) blocks
; including optional half pel interpolation of [ebp+8] ; blk1 (hx,hy)
; blk1,blk2: addresses of top left pels of both blocks
; lx: distance (in bytes) of vertically adjacent pels
; hx,hy: flags for horizontal and/or vertical interpolation
; h: height of block (usually 8 or 16)
; mmX version
global sumsq_mmx
; int sumsq_mmx(unsigned char *blk1, unsigned char *blk2,
; int lx, int hx, int hy, int h)
; mm7 = 0
; eax = pblk1
; ebx = pblk2
; ecx = temp
; edx = distance_sum
; edi = h
; esi = lx
;;
;; private constants needed
;;
SECTION .data
align 16
twos:
dw 2
dw 2
dw 2
dw 2
SECTION .text
align 32
sumsq_mmx:
push ebp ; save frame pointer
mov ebp, esp ; link
push ebx
push ecx
push edx
push esi
push edi
mov esi, [ebp+16] ; lx
mov eax, [ebp+20] ; hx
mov edx, [ebp+24] ; hy
mov edi, [ebp+28] ; h
pxor mm5, mm5 ; sum
test edi, edi ; h = 0?
jle near d2exit
pxor mm7, mm7 ; get zeros i mm7
test eax, eax ; hx != 0?
jne near d2is10
test edx, edx ; hy != 0?
jne near d2is10
mov eax, [ebp+8]
mov ebx, [ebp+12]
jmp d2top00
align 32
d2top00:
movq mm0, [eax]
movq mm1, mm0
punpcklbw mm0, mm7
punpckhbw mm1, mm7
movq mm2, [ebx]
movq mm3, mm2
punpcklbw mm2, mm7
punpckhbw mm3, mm7
psubw mm0, mm2
psubw mm1, mm3
pmaddwd mm0, mm0
pmaddwd mm1, mm1
paddd mm0, mm1
movq mm1, [eax+8]
movq mm2, mm1
punpcklbw mm1, mm7
punpckhbw mm2, mm7
movq mm3, [ebx+8]
movq mm4, mm3
punpcklbw mm3, mm7
punpckhbw mm4, mm7
psubw mm1, mm3
psubw mm2, mm4
pmaddwd mm1, mm1
pmaddwd mm2, mm2
paddd mm1, mm2
paddd mm0, mm1
;; Accumulate sum in edx... we use mm5
;movd ecx, mm0
;add edx, ecx
;psrlq mm0, 32
;movd ecx, mm0
;add edx, ecx
paddd mm5, mm0
add eax, esi
add ebx, esi
dec edi
jg d2top00
jmp near d2exit
;jmp d2exit changes made because of problems with nasm 0.98.23
d2is10:
test eax, eax
je near d2is01
test edx, edx
jne near d2is01
mov eax, [ebp+8] ; blk1
mov ebx, [ebp+12] ; blk1
pxor mm6, mm6 ; mm6 = 0 and isn't changed anyplace in the loop..
pcmpeqw mm1, mm1
psubw mm6, mm1
jmp d2top10
align 32
d2top10:
movq mm0, [eax]
movq mm1, mm0
punpcklbw mm0, mm7
punpckhbw mm1, mm7
movq mm2, [eax+1]
movq mm3, mm2
punpcklbw mm2, mm7
punpckhbw mm3, mm7
paddw mm0, mm2
paddw mm1, mm3
paddw mm0, mm6 ; here we add mm6 = 0.... weird...
paddw mm1, mm6
psrlw mm0, 1
psrlw mm1, 1
movq mm2, [ebx]
movq mm3, mm2
punpcklbw mm2, mm7
punpckhbw mm3, mm7
psubw mm0, mm2
psubw mm1, mm3
pmaddwd mm0, mm0
pmaddwd mm1, mm1
paddd mm0, mm1
movq mm1, [eax+8]
movq mm2, mm1
punpcklbw mm1, mm7
punpckhbw mm2, mm7
movq mm3, [eax+9]
movq mm4, mm3
punpcklbw mm3, mm7
punpckhbw mm4, mm7
paddw mm1, mm3
paddw mm2, mm4
paddw mm1, mm6
paddw mm2, mm6
psrlw mm1, 1
psrlw mm2, 1
movq mm3, [ebx+8]
movq mm4, mm3
punpcklbw mm3, mm7
punpckhbw mm4, mm7
psubw mm1, mm3
psubw mm2, mm4
pmaddwd mm1, mm1
pmaddwd mm2, mm2
paddd mm1, mm2
paddd mm0, mm1
; Accumulate mm0 sum on edx... we'll use mm5 for this and add up at the end
; movd ecx, mm0
; add edx, ecx
; psrlq mm0, 32
; movd ecx, mm0
; add edx, ecx
paddd mm5, mm0
add eax, esi
add ebx, esi
dec edi
jg near d2top10
jmp near d2exit
;jmp d2exit changes made because of problems with nasm 0.98.23
d2is01:
test eax, eax
jne near d2is11
test edx, edx
je near d2is11
mov eax, [ebp+8] ; blk1
mov edx, [ebp+12] ; blk2
mov ebx, eax
add ebx, esi ; blk1 + lx
pxor mm6, mm6
pcmpeqw mm1, mm1
psubw mm6, mm1 ; mm6 = 1
jmp d2top01
align 32
d2top01:
movq mm0, [eax]
movq mm1, mm0
punpcklbw mm0, mm7
punpckhbw mm1, mm7
movq mm2, [ebx]
movq mm3, mm2
punpcklbw mm2, mm7
punpckhbw mm3, mm7
paddw mm0, mm2
paddw mm1, mm3
paddw mm0, mm6
paddw mm1, mm6
psrlw mm0, 1
psrlw mm1, 1
movq mm2, [edx]
movq mm3, mm2
punpcklbw mm2, mm7
punpckhbw mm3, mm7
psubw mm0, mm2
psubw mm1, mm3
pmaddwd mm0, mm0
pmaddwd mm1, mm1
paddd mm0, mm1
movq mm1, [eax+8]
movq mm2, mm1
punpcklbw mm1, mm7
punpckhbw mm2, mm7
movq mm3, [ebx+8]
movq mm4, mm3
punpcklbw mm3, mm7
punpckhbw mm4, mm7
paddw mm1, mm3
paddw mm2, mm4
paddw mm1, mm6
paddw mm2, mm6
psrlw mm1, 1
psrlw mm2, 1
movq mm3, [edx+8]
movq mm4, mm3
punpcklbw mm3, mm7
punpckhbw mm4, mm7
psubw mm1, mm3
psubw mm2, mm4
pmaddwd mm1, mm1
pmaddwd mm2, mm2
paddd mm0, mm1
paddd mm0, mm2
paddd mm5, mm0
mov eax, ebx ; eax = eax + lx
add edx, esi ; edx = edx + lx
add ebx, esi ; ebx = ebx + lx
dec edi
jg near d2top01
jmp near d2exit
;jmp d2exit changes made because of problems with nasm 0.98.23
d2is11:
mov eax, [ebp+8] ; blk1
mov edx, [ebp+12] ; blk2
mov ebx, eax ; blk1
add ebx, esi ; ebx = blk1 + lx
jmp d2top11
align 32
d2top11:
movq mm0, [eax]
movq mm1, mm0
punpcklbw mm0, mm7
punpckhbw mm1, mm7
movq mm2, [eax+1]
movq mm3, mm2
punpcklbw mm2, mm7
punpckhbw mm3, mm7
paddw mm0, mm2
paddw mm1, mm3
movq mm2, [ebx]
movq mm3, mm2
punpcklbw mm2, mm7
punpckhbw mm3, mm7
movq mm4, [ebx+1]
movq mm6, mm4
punpcklbw mm4, mm7
punpckhbw mm6, mm7
paddw mm2, mm4
paddw mm3, mm6
paddw mm0, mm2
paddw mm1, mm3
movq mm6, [twos]
paddw mm0, mm6 ; round mm0
paddw mm1, mm6 ; round mm1
psrlw mm0, 2
psrlw mm1, 2
movq mm2, [edx]
movq mm3, mm2
punpcklbw mm2, mm7
punpckhbw mm3, mm7
psubw mm0, mm2
psubw mm1, mm3
pmaddwd mm0, mm0
pmaddwd mm1, mm1
paddd mm0, mm1
movq mm1, [eax+8]
movq mm2, mm1
punpcklbw mm1, mm7
punpckhbw mm2, mm7
movq mm3, [eax+9]
movq mm4, mm3
punpcklbw mm3, mm7
punpckhbw mm4, mm7
paddw mm1, mm3
paddw mm2, mm4
movq mm3, [ebx+8]
movq mm4, mm3
punpcklbw mm3, mm7
punpckhbw mm4, mm7
paddw mm1, mm3
paddw mm2, mm4
movq mm3, [ebx+9]
movq mm4, mm3
punpcklbw mm3, mm7
punpckhbw mm4, mm7
paddw mm1, mm3
paddw mm2, mm4
movq mm6, [twos]
paddw mm1, mm6
paddw mm2, mm6
psrlw mm1, 2
psrlw mm2, 2
movq mm3, [edx+8]
movq mm4, mm3
punpcklbw mm3, mm7
punpckhbw mm4, mm7
psubw mm1, mm3
psubw mm2, mm4
pmaddwd mm1, mm1
pmaddwd mm2, mm2
paddd mm1, mm2
paddd mm0, mm1
;;
;; Accumulate the result in "s" we use mm6 for the purpose...
paddd mm5, mm0
mov eax, ebx ; ahem ebx = eax at start of loop and wasn't changed...
add ebx, esi
add edx, esi
dec edi
jg near d2top11
d2exit:
;; Put the final sum in eax for return...
movd eax, mm5
psrlq mm5, 32
movd ecx, mm5
add eax, ecx
pop edi
pop esi
pop edx
pop ecx
pop ebx
pop ebp ; restore stack pointer
emms ; clear mmx registers
ret
; total squared difference between two (8*h) blocks
; blk1,blk2: addresses of top left pels of both blocks
; lx: distance (in bytes) of vertically adjacent pels
; h: height of block (usually 4, or 8)
; mmX version
global sumsq_sub22_mmx
; int sumsq_sub22_mmx(unsigned char *blk1, unsigned char *blk2,
; int lx, int h)
; mm7 = 0
; eax = pblk1
; ebx = pblk2
; ecx = temp
; edx = distance_sum
; edi = h
; esi = lx
align 32
sumsq_sub22_mmx:
push ebp ; save frame pointer
mov ebp, esp ; link
push ebx
push ecx
push edx
push esi
push edi
mov esi, [ebp+16] ; lx
mov edi, [ebp+20] ; h
pxor mm5, mm5 ; sum
test edi, edi ; h = 0?
jle near d2exit
pxor mm7, mm7 ; get zeros i mm7
mov eax, [ebp+8] ; blk1
mov ebx, [ebp+12] ; blk2
jmp d2top22
align 32
d2top22:
movq mm0, [eax]
movq mm1, mm0
punpcklbw mm0, mm7
punpckhbw mm1, mm7
movq mm2, [ebx]
movq mm3, mm2
punpcklbw mm2, mm7
punpckhbw mm3, mm7
psubw mm0, mm2
psubw mm1, mm3
pmaddwd mm0, mm0
pmaddwd mm1, mm1
paddd mm5, mm0
paddd mm5, mm1
add eax, esi
add ebx, esi
dec edi
jg d2top22
jmp d2exit
; total squared difference between interpolation of two (8*h) blocks and
; another 8*h block
; blk1,blk2: addresses of top left pels of both blocks
; lx: distance (in bytes) of vertically adjacent pels
; h: height of block (usually 4, or 8)
; mmX version
global bsumsq_sub22_mmx
; int bsumsq_sub22_mmx(unsigned char *blk1f, unsigned char*blk1b,
; unsigned char *blk2,
; int lx, int h)
; mm7 = 0
; eax = pblk1f
; ebx = pblk2
; ecx = pblk1b
; edx = distance_sum
; edi = h
; esi = lx
align 32
bsumsq_sub22_mmx:
push ebp ; save frame pointer
mov ebp, esp ; link
push ebx
push ecx
push edx
push esi
push edi
mov esi, [ebp+20] ; lx
mov edi, [ebp+24] ; h
pxor mm5, mm5 ; sum
test edi, edi ; h = 0?
jle near d2exit
pxor mm7, mm7 ; get zeros i mm7
mov eax, [ebp+8] ; blk1f
mov ebx, [ebp+12] ; blk1b
mov ecx, [ebp+16] ; blk2
jmp bd2top22
align 32
bd2top22:
movq mm0, [eax]
movq mm1, mm0
movq mm4, [ebx]
movq mm6, mm4
punpcklbw mm0, mm7
punpckhbw mm1, mm7
punpcklbw mm4, mm7
punpckhbw mm6, mm7
movq mm2, [ecx]
movq mm3, mm2
punpcklbw mm2, mm7
punpckhbw mm3, mm7
paddw mm0, mm4
psrlw mm0, 1
psubw mm0, mm2
pmaddwd mm0, mm0
paddw mm1, mm6
psrlw mm1, 1
psubw mm1, mm3
pmaddwd mm1, mm1
paddd mm5, mm0
paddd mm5, mm1
add eax, esi
add ebx, esi
add ecx, esi
dec edi
jg bd2top22
jmp d2exit
global variance_mmx
;;; variance of a (size*size) block, multiplied by 256
;;; p: address of top left pel of block
;;; lx: seperation (in bytes) of vertically adjacent pels
;;; NOTE: size is 8 or 16
;;; int variance_mmx(uint8_t *p, int size, int lx)
variance_mmx:
push ebp ; save frame pointer
mov ebp, esp ; link
push ebx
push ecx
push edx
mov eax, 0 ; col offset...
mov edx, [ebp+16] ; lx
pxor mm0, mm0
pxor mm7, mm7 ; Zero sum accumulator (4 words)
pxor mm6, mm6 ; Zero squares accumulator (2 dwords)
varcols:
mov ebx, [ebp+8] ; p
mov ecx, [ebp+12] ; size
varrows:
movq mm2, [ebx+eax]
movq mm3, mm2
punpcklbw mm2, mm0
punpckhbw mm3, mm0
movq mm4, mm2
movq mm5, mm3
pmaddwd mm4, mm2 ; Squares 0:3
paddw mm7, mm2 ; Accumulate sum 0:3 (words)
paddd mm6, mm4 ; Accumulate sum squares 0:3 (dwords)
pmaddwd mm5, mm3 ; Squares 4:7 (words)
paddw mm7, mm3 ; Accumulate sum 0:3 (words)
paddd mm6, mm5 ; Accumulate sum squares 4:7 (dwords)
add ebx, edx ; Next row
dec ecx
jnz varrows
add eax, 8
cmp eax, [ebp+12]
jl varcols
;; Sum squared -> eax
movq mm1, mm7
psrlq mm1, 32
paddw mm7, mm1
movq mm1, mm7
psrlq mm1, 16
paddw mm7, mm1
movd eax, mm7
and eax, 0xffff
imul eax, eax
;; Squares sum -> ecx
movq mm1, mm6
psrlq mm1, 32
paddd mm6, mm1
movd ecx, mm6
mov ebx, [ebp+12]
shr eax, 6 ; Divide sum squared by 64 for 8*8
cmp ebx, 8
jz var8_8 ; If 16 * 16 divide again by 4 (256)
shr eax, 2
var8_8:
sub ecx, eax
mov eax, ecx
pop edx
pop ecx
pop ebx
pop ebp ; restore stack pointer
emms ; clear mmx registers
ret