Files
veejay/veejay-30aug04/veejay-0.6/ccvt/ccvt_mmx.S
Niels Elburg 8ee2c5578f Initial revision
git-svn-id: svn://code.dyne.org/veejay/trunk@3 eb8d1916-c9e9-0310-b8de-cf0c9472ead5
2004-07-30 18:46:12 +00:00

344 lines
8.1 KiB
ArmAsm

/* CCVT: ColourConVerT: simple library for converting colourspaces
Copyright (C) 2002 Nemosoft Unv.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
For questions, remarks, patches, etc. for this program, the author can be
reached at nemosoft@smcc.demon.nl.
*/
/* The ccvt_* functions always have 4 parameters:
width 8(%ebp)
height 12(%ebp)
src 16(%ebp)
dst 20(%ebp)
*/
#define __ASSEMBLY__
#include <linux/linkage.h>
/* Stack-frame operands. All of these are textual operand substitutions and
   are only valid once %ebp has been set up as the frame pointer.
   Positive offsets address the four incoming arguments (the first one is
   at 8(%ebp)); negative offsets address the 8 bytes of local storage that
   the function prologue reserves below %ebp. */
#define Width 8(%ebp)
#define Height 12(%ebp)
/* 2 parameters, 1 in, 1 out */
#define Src 16(%ebp)
#define Dst 20(%ebp)
/* The buffer space is in the MMX registers :-)
We only need the U and V pointers on the stack
*/
/* Current position in the U plane and in the V plane, respectively. */
#define Uptr -4(%ebp)
#define Vptr -8(%ebp)
.data
/* Some constants used during processing.
   Every constant below is exactly 8 bytes long and is read with a 64-bit
   MMX memory operand, so the table is aligned on an 8-byte boundary; this
   keeps each load inside one aligned quadword. */
	.balign	8
mm_0:		.byte	0, 0, 0, 0, 0, 0, 0, 0
mm_128:		.byte	128, 128, 128, 128, 128, 128, 128, 128
mm_mask:	.byte	255, 255, 255, 0, 255, 255, 255, 0	/* clears byte 3 of each pixel */
mm_low:		.byte	255, 255, 255, 255, 0, 0, 0, 0		/* keeps the low doubleword */
mm_high:	.byte	0, 0, 0, 0, 255, 255, 255, 255		/* keeps the high doubleword */
/* Chroma multiplication factors, listed from the lowest word upwards.
   The UV code multiplies (sample - 128) * 256 by these with pmulhw, so
   each factor is effectively divided by 256.
     mm_mul_bgr:  U->B, U->G, V->G, V->R
     mm_mul_rgb:  V->R, V->G, U->G, U->B
*/
mm_mul_bgr:	.word	454, -88, -183, 359
mm_mul_rgb:	.word	359, -183, -88, 454
.text
/* test_params: load the source/destination pointers and validate the
   arguments shared by all ccvt_* functions.

   In:   %ebp must already point at the stack frame, because Src, Dst,
         Width and Height are addressed relative to it.
   Out:  %esi = Src, %edi = Dst
         carry flag clear  -> all checks passed
         carry flag set    -> at least one check failed
   The checks are: both pointers non-NULL, width and height greater than
   zero (compared as signed values, so negative sizes are rejected too),
   and width and height both even.

   test_width_height is a second entry point that performs only the
   width/height checks and does not load %esi/%edi.
*/
test_params:
	mov	Src, %esi
	mov	Dst, %edi
	test	%esi, %esi		# NULL source pointer?
	jz	param_fail
	test	%edi, %edi		# NULL destination pointer?
	jz	param_fail
test_width_height:
	cmpl	$0, Width
	jle	param_fail		# zero or negative width
	testl	$1, Width		# Odd no. of columns?
	jnz	param_fail		# Aye
	cmpl	$0, Height		# explicit 32-bit compare against memory
	jle	param_fail		# zero or negative height
	testl	$1, Height		# Odd no. of lines?
	jnz	param_fail		# Aye
	/* fall through */
/* exit points */
param_ok:
	clc				# Success: clear carry
	ret
param_fail:
	stc				# Fail: set carry
	ret
/* param_yuv_dst: the output is planar YUV; derive the U and V plane
   pointers from the destination pointer in %edi.

   In:   %edi = start of the Y plane, %ebp = frame pointer
   Out:  Uptr = %edi + width*height
         Vptr = %edi + width*height + (width*height)/4
   Clobbers %eax, %edx (high half of the product, ignored) and the flags.
*/
param_yuv_dst:
	mov	Width, %eax
	mull	Height			# eax = w*h (low 32 bits of the product)
	mov	%edi, Uptr		# U = Y
	add	%eax, Uptr		# U = Y + w*h
	mov	%edi, Vptr		# V = Y
	add	%eax, Vptr		# V = Y + w*h
	shr	$2, %eax		# eax = (w*h)/4
	add	%eax, Vptr		# V = Y + w*h + (w*h)/4
	ret
/* param_yuv_src: the input is planar YUV; derive the U and V plane
   pointers from the source pointer in %esi.

   In:   %esi = start of the Y plane, %ebp = frame pointer
   Out:  Uptr = %esi + width*height
         Vptr = %esi + width*height + (width*height)/4
   Clobbers %eax, %edx (high half of the product, ignored) and the flags.
*/
param_yuv_src:
	mov	Width, %eax
	mull	Height			# eax = w*h (low 32 bits of the product)
	mov	%esi, Uptr		# U = Y
	add	%eax, Uptr		# U = Y + w*h
	mov	%esi, Vptr		# V = Y
	add	%eax, Vptr		# V = Y + w*h
	shr	$2, %eax		# eax = (w*h)/4
	add	%eax, Vptr		# V = Y + w*h + (w*h)/4
	ret
/*************************************/
/* ENTER_FUNC: common function prologue.
   Builds the %ebp frame, reserves 8 bytes of locals (the Uptr and Vptr
   slots), saves %ebx, %esi and %edi, then validates the arguments with
   test_params. When validation fails (carry set) control jumps forward
   to local label 9, which the matching epilogue macro must provide.
   The frame is built with push/mov/sub, which produces the same frame as
   "enter $8, $0" without using the slower ENTER instruction.
*/
.macro ENTER_FUNC
	push	%ebp
	mov	%esp, %ebp
	sub	$8, %esp		# 8 bytes for UV pointers
	push	%ebx
	push	%esi
	push	%edi
	call	test_params		# sets esi/edi, carry = failure
	jc	9f
.endm
/* START_LOOP_YUV order
   Set-up for the conversion loops when the input is planar YUV.
   order is "bgr" or "rgb" and selects the factor table mm_mul_<order>.

   After this macro:
     %mm6 = mm_128, %mm5 = mm_mask, %mm4 = chroma factors
     Uptr/Vptr point at the U and V planes of the source
     %ebx = original width (row stride used for the second row)
     Width and Height on the stack are both halved in place
     local label 0 marks the top of the row loop, where %ecx is
     reloaded with the halved width (the column-loop counter)
*/
.macro START_LOOP_YUV order
	movq	mm_128, %mm6		# load constants
	movq	mm_mask, %mm5
	movq	mm_mul_\order, %mm4
	call	param_yuv_src		# our input is YUVp
	mov	Width, %ebx		# full width, taken before halving
	shrl	$1, Height		# two lines are handled per pass
	shrl	$1, Width		# two columns are handled per pass
0:	mov	Width, %ecx		# number of column iterations
.endm
/* LOAD_MUL_UV is_rgb
   Top of the column loop (local label 1). Reads one U and one V byte,
   advances Uptr and Vptr by one, multiplies the samples with the factors
   held in %mm4 and leaves the resulting colour offsets in %mm7 as signed
   bytes, duplicated in both doublewords.

   is_rgb chooses which sample ends up in the low half of %mm2, so that the
   word order matches the factor table that was loaded (0 = BGR, 1 = RGB).

   Expects %mm6 = mm_128 and %mm4 = factor table.
   Uses %eax and %edx as scratch and overwrites %mm2 and %mm7; %ebx is
   pushed on entry and popped again at the end.
*/
.macro LOAD_MUL_UV is_rgb=0
1: push %ebx
// Section A1: load and prepare UV values
mov Uptr, %ebx
movzbl (%ebx), %eax # load U byte
inc %ebx
mov %al, %ah # duplicate byte
mov %ebx, Uptr # Faster than "incl Uptr"
mov Vptr, %ebx
movzbl (%ebx), %edx # load V byte
inc %ebx
mov %dl, %dh # duplicate byte
mov %ebx, Vptr
.if \is_rgb
shl $16, %eax
or %edx, %eax # U pair in the upper, V pair in the lower 16 bits of eax
movd %eax, %mm2 # 00 00 00 00 UU UU VV VV
.else
shl $16, %edx
or %eax, %edx # V pair in the upper, U pair in the lower 16 bits of edx
movd %edx, %mm2 # 00 00 00 00 VV VV UU UU
.endif
// Section A2: multiply UV values and shuffle
// Note: byte orders shown in the MMX registers are for BGR order
psubb %mm6, %mm2 # -128
punpcklbw mm_0, %mm2 # 00 VV 00 VV 00 UU 00 UU
psllw $8, %mm2 # VV 00 VV 00 UU 00 UU 00
pmulhw %mm4, %mm2 # multiply with factors, signed
movq %mm2, %mm7 # vr vr vg vg ug ug ub ub
pand mm_low, %mm7 # 00 00 00 00 ug ug ub ub
pand mm_high, %mm2 # vr vr vg vg 00 00 00 00
psrlq $16, %mm2 # 00 00 vr vr vg vg 00 00
paddw %mm2, %mm7 # 00 00 vr vr *g *g ub ub
packsswb %mm7, %mm7 # pack signed saturated, and duplicate values (!)
/* 00 vr *g ub 00 vr *g ub
NB! This introduces saturation before the
end result is calculated. In strongly
saturated areas with high or low luminance
this is visible as a darkening resp.
brightening.
I doubt this is a real problem... The
only real solution is to keep these values
as 16 bits, and subtract at the end, which
unfortunately introduces extra cycles.
*/
pop %ebx
.endm
/* _DO_2Y_UV reg
   Turn two Y values into two finished pixels by adding the UV offsets.
   Unfortunately, there is no 'duplicate byte into 4 mmx bytes' instruction,
   so the bytes are first spread out in the integer registers.

   In:   %eax = two Y bytes in bits 0..15 (Yx0 in the low byte); the
         callers in this file leave bits 16..31 zero
         %mm5 = mm_mask, %mm6 = mm_128, %mm7 = signed UV offsets
   Out:  MMX register number reg = two pixels, Yx0 in the low doubleword and
         Yx1 in the high doubleword, with byte 3 of each pixel zero
   Clobbers %eax, %edx and %mm3.
*/
.macro _DO_2Y_UV reg
mov %eax, %edx # dup AX register
shl $8, %eax # 00 Yx1 Yx0 00
ror $8, %edx # Yx0 00 00 Yx1
bswap %edx # Yx1 00 00 Yx0
or %edx, %eax # Yx1 Yx1 Yx0 Yx0
movd %eax, %mm3 # Load into MMX register
movq %mm3, %mm\reg # Double, and....
punpcklbw %mm3, %mm\reg # Poof! Yx1 Yx1 Yx1 Yx1 Yx0 Yx0 Yx0 Yx0
pand %mm5, %mm\reg # This isnt strictly necessary, but keeps the alpha byte at 0
psubb %mm6, %mm\reg # Turn into signed
paddsb %mm7, %mm\reg # add UV part (8 bytes!), saturating as signed bytes
paddb %mm6, %mm\reg # Make unsigned again
.endm
/* LOAD_4Y_ADD_UV
   Fetch the 2x2 block of Y samples that shares the current U/V pair and
   convert it to four pixels.

   In:   %esi = current position in the first Y row, %ebx = row stride
   Out:  %mm0 = two pixels of the first row, %mm1 = two pixels of the
         second row; %esi is advanced past the two Y bytes just read
         (lodsw steps forward by 2 as long as the direction flag is clear)
   Clobbers whatever _DO_2Y_UV clobbers.
*/
.macro LOAD_4Y_ADD_UV
	movzwl	(%esi, %ebx), %eax	# second row: Y10 and Y11 in bits 0..15
	_DO_2Y_UV 1			/* result goes to MM1 */
	xor	%eax, %eax		# make sure the upper bits are zero
	lodsw				# first row: Y00 and Y01
	_DO_2Y_UV 0			/* result goes to MM0 */
.endm
/* STORE_MMX32
   Write the 2x2 pixel block as 4 bytes per pixel.
   %mm0 goes to the current output row at (%edi), %mm1 to the row below,
   which starts 4 * %ebx bytes further on. %edi then moves on by 8 bytes
   (two pixels).
*/
.macro STORE_MMX32
	// Section B2
	movq	%mm0, (%edi)		# two pixels of the first row
	movq	%mm1, (%edi, %ebx, 4)	# two pixels of the second row: dst + 4 * width
	add	$8, %edi
.endm
/* _PUSH_EAX24
   Store the low three bytes of %eax at (%edi) and advance %edi by 3
   (stosw/stosb step forward as long as the direction flag is clear).
   %eax is shifted right by 16 in the process. The reg parameter is
   accepted but not used by the body.
*/
.macro _PUSH_EAX24 reg
	stosw				# bytes 0 and 1
	shr	$16, %eax		# bring byte 2 down into al
	stosb				# byte 2
.endm
/* _PUSH_MMX24 reg
   Write both pixels held in MMX register number reg as 3 bytes each,
   6 bytes in total, at (%edi). The low doubleword is written first.
   The MMX register is shifted right by 32 bits and %eax is overwritten.
*/
.macro _PUSH_MMX24 reg
	movd	%mm\reg, %eax		# low doubleword: pixel x0
	_PUSH_EAX24
	psrlq	$32, %mm\reg		# move pixel x1 down
	movd	%mm\reg, %eax
	_PUSH_EAX24
.endm
/* STORE_MMX24
   Write the 2x2 pixel block as 3 bytes per pixel. This takes more work
   than the 32-bit case because the pixels cannot be stored as whole
   quadwords.
   %mm1 is written to the row below the current one (3 * %ebx bytes
   further on), then %mm0 to the current row; on exit %edi has advanced
   by 6 bytes. Clobbers %eax and %edx and shifts %mm0 and %mm1.
*/
.macro STORE_MMX24
	lea	(%ebx, %ebx, 2), %edx	# edx = 3 * ebx
	push	%edi			# remember the position in the first row
	add	%edx, %edi		# edi = edi + 3 * width
	_PUSH_MMX24 1
	pop	%edi			# back to the first row
	_PUSH_MMX24 0
.endm
/* END_LOOP_32
   Close the column loop (label 1) and the row loop (label 0) for output
   with 4 bytes per pixel.
   When a row pair is finished, %esi and %edi have each moved across one
   full row, so the second row of the pair is skipped by adding one more
   row: %ebx bytes of input and 4 * %ebx bytes of output.
*/
.macro END_LOOP_32
	dec	%ecx
	jnz	1b			# next pair of columns
	add	%ebx, %esi		# skip the input row already consumed
	add	%ebx, %edi		# skip the output row already written:
	add	%ebx, %edi		#   four additions of the width
	add	%ebx, %edi		#   make up 4 bytes per pixel
	add	%ebx, %edi
	decl	Height			# one less pair of lines to go
	jnz	0b
.endm
/* END_LOOP_24
   Close the column loop (label 1) and the row loop (label 0) for output
   with 3 bytes per pixel.
   When a row pair is finished, %esi and %edi have each moved across one
   full row, so the second row of the pair is skipped by adding one more
   row: %ebx bytes of input and 3 * %ebx bytes of output.
*/
.macro END_LOOP_24
	dec	%ecx
	jnz	1b			# next pair of columns
	add	%ebx, %esi		# skip the input row already consumed
	add	%ebx, %edi		# skip the output row already written:
	add	%ebx, %edi		#   three additions of the width
	add	%ebx, %edi		#   make up 3 bytes per pixel
	decl	Height			# one less pair of lines to go
	jnz	0b
.endm
/* LEAVE_FUNC: common function epilogue.
   Leaves MMX mode with emms, restores %edi, %esi and %ebx, tears down the
   frame and returns. Local label 9 is the target used by the prologue
   when the arguments were rejected; it sits after emms, so that path
   skips it.
*/
.macro LEAVE_FUNC
	emms				# Clear MMX state
9:	pop	%edi
	pop	%esi
	pop	%ebx
	leave
	ret
.endm
/* Functions to go from planar YUV 4:2:0 formats to RGB/BGR. Note that these
functions are built entirely from macros.
*/
/* ccvt_420p_bgr32: stack arguments are width, height, src, dst.
   Converts the planar YUV 4:2:0 image at src to 4 bytes per pixel at dst,
   stored in memory as B, G, R, 0.
   Returns without writing anything when the argument checks fail.
*/
ENTRY(ccvt_420p_bgr32)
	ENTER_FUNC			/* prologue and argument checks */
	START_LOOP_YUV bgr		/* constants, U/V pointers, row loop */
	LOAD_MUL_UV 0			/* column loop: chroma offsets */
	LOAD_4Y_ADD_UV			/* four Y samples -> four pixels */
	STORE_MMX32			/* write 2x2 pixels, 4 bytes each */
	END_LOOP_32
	LEAVE_FUNC
/* ccvt_420p_bgr24: stack arguments are width, height, src, dst.
   Converts the planar YUV 4:2:0 image at src to 3 bytes per pixel at dst,
   stored in memory as B, G, R.
   Returns without writing anything when the argument checks fail.
*/
ENTRY(ccvt_420p_bgr24)
	ENTER_FUNC			/* prologue and argument checks */
	START_LOOP_YUV bgr		/* constants, U/V pointers, row loop */
	LOAD_MUL_UV 0			/* column loop: chroma offsets */
	LOAD_4Y_ADD_UV			/* four Y samples -> four pixels */
	STORE_MMX24			/* write 2x2 pixels, 3 bytes each */
	END_LOOP_24
	LEAVE_FUNC
/* ccvt_420p_rgb32: stack arguments are width, height, src, dst.
   Converts the planar YUV 4:2:0 image at src to 4 bytes per pixel at dst,
   stored in memory as R, G, B, 0.
   Returns without writing anything when the argument checks fail.
*/
ENTRY(ccvt_420p_rgb32)
	ENTER_FUNC			/* prologue and argument checks */
	START_LOOP_YUV rgb		/* constants, U/V pointers, row loop */
	LOAD_MUL_UV 1			/* column loop: chroma offsets */
	LOAD_4Y_ADD_UV			/* four Y samples -> four pixels */
	STORE_MMX32			/* write 2x2 pixels, 4 bytes each */
	END_LOOP_32
	LEAVE_FUNC
/* ccvt_420p_rgb24: stack arguments are width, height, src, dst.
   Converts the planar YUV 4:2:0 image at src to 3 bytes per pixel at dst,
   stored in memory as R, G, B.
   Returns without writing anything when the argument checks fail.
*/
ENTRY(ccvt_420p_rgb24)
	ENTER_FUNC			/* prologue and argument checks */
	START_LOOP_YUV rgb		/* constants, U/V pointers, row loop */
	LOAD_MUL_UV 1			/* column loop: chroma offsets */
	LOAD_4Y_ADD_UV			/* four Y samples -> four pixels */
	STORE_MMX24			/* write 2x2 pixels, 3 bytes each */
	END_LOOP_24
	LEAVE_FUNC