avcodec/x86/vp8dsp: Don't use MMX registers in ff_put_vp8_epel4_h6_ssse3

Doubling the register width allowed to avoid a pshufb and a pmaddubsw.

Old benchmarks:
vp8_put_epel4_h6_c:                                    115.9 ( 1.00x)
vp8_put_epel4_h6_ssse3:                                 20.2 ( 5.74x)
vp8_put_epel4_h6v4_c:                                  276.3 ( 1.00x)
vp8_put_epel4_h6v4_ssse3:                               58.6 ( 4.71x)
vp8_put_epel4_h6v6_c:                                  363.6 ( 1.00x)
vp8_put_epel4_h6v6_ssse3:                               62.5 ( 5.82x)

New benchmarks:
vp8_put_epel4_h6_c:                                    116.4 ( 1.00x)
vp8_put_epel4_h6_ssse3:                                 16.0 ( 7.29x)
vp8_put_epel4_h6v4_c:                                  280.9 ( 1.00x)
vp8_put_epel4_h6v4_ssse3:                               44.3 ( 6.33x)
vp8_put_epel4_h6v6_c:                                  365.6 ( 1.00x)
vp8_put_epel4_h6v6_ssse3:                               53.1 ( 6.89x)

Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2025-11-24 16:11:10 +01:00
parent 3135bc0d3a
commit 99fb257f58

View File

@@ -33,6 +33,16 @@ fourtap_filter4_b_m: times 4 db -6, 123
times 4 db -1, 12
times 4 db 123, -6
sixtap_filter4_hb_m: times 8 db 2, -11
times 4 db 108, -8
times 4 db 36, 1
times 8 db 3, -16
times 4 db 77, -16
times 4 db 77, 3
times 8 db 1, -8
times 4 db 36, -11
times 4 db 108, 2
fourtap_filter_hb_m: times 8 db -6, 123
times 8 db 12, -1
times 8 db -9, 93
@@ -129,6 +139,7 @@ bilinear_filter_vb_m: times 8 db 7, 1
%define fourtap_filter4_b picregq
%define sixtap_filter_hb picregq
%define sixtap_filter_b picregq
%define sixtap_filter4_hb picregq
%define fourtap_filter_v picregq
%define sixtap_filter_v picregq
%define bilinear_filter_vw picregq
@@ -140,6 +151,7 @@ bilinear_filter_vb_m: times 8 db 7, 1
%define fourtap_filter4_b fourtap_filter4_b_m
%define sixtap_filter_hb sixtap_filter_hb_m
%define sixtap_filter_b sixtap_filter_b_m
%define sixtap_filter4_hb sixtap_filter4_hb_m
%define fourtap_filter_v fourtap_filter_v_m
%define sixtap_filter_v sixtap_filter_v_m
%define bilinear_filter_vw bilinear_filter_vw_m
@@ -148,6 +160,7 @@ bilinear_filter_vb_m: times 8 db 7, 1
%endif
filter4_h4_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
filter4_h6_shuf: db 1, 3, 2, 4, 3, 5, 4, 6, 2, 4, 3, 5, 4, 6, 5, 7
filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
@@ -180,7 +193,16 @@ SECTION .text
%define MOV movq
%endif
cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 6+2*(%1==8), dst, dststride, src, srcstride, height, mx, picreg
%if %1 == 4
mova m3, [filter4_h6_shuf]
%if PIC
lea picregq, [sixtap_filter4_hb_m]
%endif
shl mxd, 4
mova m4, [sixtap_filter4_hb+mxq-32]
mova m5, [sixtap_filter4_hb+mxq-16]
%else
lea mxd, [mxq*3]
mova m3, [filter_h6_shuf2]
mova m4, [filter_h6_shuf3]
@@ -190,29 +212,35 @@ cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, h
mova m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
mova m6, [sixtap_filter_hb+mxq*8-32]
mova m7, [sixtap_filter_hb+mxq*8-16]
%endif
.nextrow:
%if %1 == 4
; we need nine bytes, so two loads
movq m1, [srcq-1]
movq m0, [srcq-2]
punpcklbw m0, m1
pshufb m1, m3
pmaddubsw m1, m5
pmaddubsw m0, m4
movhlps m2, m1
%else
movu m0, [srcq-2]
mova m1, m0
mova m2, m0
%if mmsize == 8
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; shuffle with a memory operand
punpcklbw m0, [srcq+3]
%else
pshufb m0, [filter_h6_shuf1]
%endif
pshufb m1, m3
pshufb m2, m4
pmaddubsw m0, m5
pmaddubsw m1, m6
pmaddubsw m2, m7
%endif
add srcq, srcstrideq
paddsw m0, m1
paddw m0, m1
paddsw m0, m2
pmulhrsw m0, [pw_256]
packuswb m0, m0
movh [dstq], m0 ; store
MOV [dstq], m0 ; store
; go to next line
add dstq, dststrideq
@@ -220,7 +248,6 @@ cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, h
jg .nextrow
RET
INIT_XMM ssse3
cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 6+!!(%1 == 8), dst, dststride, src, srcstride, height, mx, picreg
mova m2, [pw_256]
%if %1 == 8
@@ -405,9 +432,8 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr
RET
%endmacro
INIT_MMX ssse3
FILTER_SSSE3 4
INIT_XMM ssse3
FILTER_SSSE3 4
FILTER_SSSE3 8
INIT_XMM sse2