mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2025-12-05 14:30:00 +01:00
avcodec/x86/vp8dsp: Don't use MMX registers in ff_put_vp8_epel4_v4_ssse3
Switching to xmm registers allows processing two rows in parallel, which leads to speedups. It also makes the function ABI compliant (no more missing emms).
Old benchmarks:
vp8_put_epel4_v4_c: 96.8 ( 1.00x)
vp8_put_epel4_v4_ssse3: 28.2 ( 3.43x)
New benchmarks:
vp8_put_epel4_v4_c: 95.1 ( 1.00x)
vp8_put_epel4_v4_ssse3: 22.8 ( 4.17x)
Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
@@ -238,6 +238,7 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, h
|
||||
jg .nextrow
|
||||
RET
|
||||
|
||||
INIT_XMM ssse3
|
||||
cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
|
||||
shl myd, 4
|
||||
%if PIC
|
||||
@@ -250,13 +251,38 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr
|
||||
; read 3 lines
|
||||
mov picregq, srcstrideq
|
||||
neg picregq
|
||||
movh m0, [srcq+picregq]
|
||||
movh m1, [srcq]
|
||||
movh m2, [srcq+srcstrideq]
|
||||
MOV m0, [srcq+picregq]
|
||||
MOV m1, [srcq]
|
||||
MOV m2, [srcq+srcstrideq]
|
||||
lea srcq, [srcq+2*srcstrideq]
|
||||
punpcklbw m0, m2
|
||||
|
||||
%if %1 == 4
|
||||
.next2rows:
|
||||
movd m3, [srcq]
|
||||
movd m4, [srcq+srcstrideq]
|
||||
punpcklbw m1, m3
|
||||
punpcklqdq m0, m1
|
||||
punpcklbw m2, m4
|
||||
pmaddubsw m0, m5
|
||||
punpcklqdq m1, m2
|
||||
pmaddubsw m1, m6
|
||||
lea srcq, [srcq+2*srcstrideq]
|
||||
paddsw m1, m0
|
||||
pmulhrsw m1, m7
|
||||
mova m0, m2
|
||||
packuswb m1, m1
|
||||
movd [dstq], m1
|
||||
mova m2, m4
|
||||
psrldq m1, 4
|
||||
movd [dstq+dststrideq], m1
|
||||
mova m1, m3
|
||||
lea dstq, [dstq+2*dststrideq]
|
||||
sub heightd, 2
|
||||
jg .next2rows
|
||||
%else
|
||||
.nextrow:
|
||||
movh m3, [srcq+2*srcstrideq] ; read new row
|
||||
movh m3, [srcq] ; read new row
|
||||
pmaddubsw m0, m5
|
||||
punpcklbw m1, m3
|
||||
pmaddubsw m4, m1, m6
|
||||
@@ -273,9 +299,9 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr
|
||||
add dstq, dststrideq
|
||||
dec heightd ; next row
|
||||
jg .nextrow
|
||||
%endif
|
||||
RET
|
||||
|
||||
INIT_XMM ssse3
|
||||
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
|
||||
lea myd, [myq*3]
|
||||
%if PIC
|
||||
|
||||
Reference in New Issue
Block a user