avcodec/x86/vp8dsp: Remove MMXEXT functions overridden by SSSE3

SSSE3 is already quite old (introduced in 2006 for Intel, 2011 for AMD),
so the overwhelming majority of our users (particularly those
who actually update their FFmpeg) will be using the SSSE3 versions.
This commit therefore removes the MMX(EXT) functions overridden
by them (which don't abide by the ABI) to get closer to a removal
of emms_c.

Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2025-11-23 11:08:14 +01:00
parent 9b14ea0aa1
commit ed5e0f9c68
2 changed files with 6 additions and 190 deletions

View File

@@ -1,5 +1,5 @@
;******************************************************************************
;* VP8 MMXEXT optimizations
;* VP8 ASM optimizations
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
;*
@@ -24,25 +24,6 @@
SECTION_RODATA
fourtap_filter_hw_m: times 4 dw -6, 123
times 4 dw 12, -1
times 4 dw -9, 93
times 4 dw 50, -6
times 4 dw -6, 50
times 4 dw 93, -9
times 4 dw -1, 12
times 4 dw 123, -6
sixtap_filter_hw_m: times 4 dw 2, -11
times 4 dw 108, 36
times 4 dw -8, 1
times 4 dw 3, -16
times 4 dw 77, 77
times 4 dw -16, 3
times 4 dw 1, -8
times 4 dw 36, 108
times 4 dw -11, 2
fourtap_filter_hb_m: times 8 db -6, 123
times 8 db 12, -1
times 8 db -9, 93
@@ -115,8 +96,6 @@ bilinear_filter_vb_m: times 8 db 7, 1
times 8 db 1, 7
%if PIC
%define fourtap_filter_hw picregq
%define sixtap_filter_hw picregq
%define fourtap_filter_hb picregq
%define sixtap_filter_hb picregq
%define fourtap_filter_v picregq
@@ -125,8 +104,6 @@ bilinear_filter_vb_m: times 8 db 7, 1
%define bilinear_filter_vb picregq
%define npicregs 1
%else
%define fourtap_filter_hw fourtap_filter_hw_m
%define sixtap_filter_hw sixtap_filter_hw_m
%define fourtap_filter_hb fourtap_filter_hb_m
%define sixtap_filter_hb sixtap_filter_hb_m
%define fourtap_filter_v fourtap_filter_v_m
@@ -322,112 +299,6 @@ FILTER_SSSE3 4
INIT_XMM ssse3
FILTER_SSSE3 8
; 4x4 block, H-only 4-tap filter
;
; Horizontal 4-tap subpel filter producing 4 output pixels per row (MMXEXT).
; Arguments (named on the cglobal line): dst, dststride, src, srcstride,
; height, mx. picreg is an extra GPR that only exists when PIC is set
; (npicregs extra registers are requested).
;
; Per row: reads 8 bytes starting at src-1, writes 4 bytes to dst.
; Register roles inside the loop:
;   mm4 = F0/F1 coefficient pairs, mm5 = F2/F3 coefficient pairs
;   mm6 = zero (for byte->word unpacking and the final pack)
;   mm7 = rounding constant loaded from pw_64
; NOTE(review): pw_64 is defined outside this view; presumably 64 in every
; word, i.e. half of the 1<<7 scale removed by the psraw below - confirm.
; NOTE(review): the loop body runs before height is tested, so height >= 1
; is assumed - confirm against callers.
INIT_MMX mmxext
cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
shl mxd, 4 ; mx *= 16: one "times 4 dw" table row is 16 bytes
%if PIC
; under PIC, fourtap_filter_hw is %defined to picregq, so load the table base first
lea picregq, [fourtap_filter_hw_m]
%endif
; The two loads pick the rows at byte offsets mx*16-16 and mx*16.
; NOTE(review): presumably mx is odd (1, 3, 5, 7) so that both rows belong
; to the same filter - verify against caller.
movq mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
movq mm5, [fourtap_filter_hw+mxq]
movq mm7, [pw_64]
pxor mm6, mm6
.nextrow:
movq mm1, [srcq-1] ; (ABCDEFGH) load 8 horizontal pixels
; first set of 2 pixels
movq mm2, mm1 ; byte ABCD..
punpcklbw mm1, mm6 ; byte->word ABCD
pshufw mm0, mm2, 9 ; byte CDEF..
punpcklbw mm0, mm6 ; byte->word CDEF
pshufw mm3, mm1, 0x94 ; word ABBC
pshufw mm1, mm0, 0x94 ; word CDDE
pmaddwd mm3, mm4 ; multiply 2px with F0/F1
movq mm0, mm1 ; backup for second set of pixels
pmaddwd mm1, mm5 ; multiply 2px with F2/F3
paddd mm3, mm1 ; finish 1st 2px
; second set of 2 pixels, use backup of above
punpckhbw mm2, mm6 ; byte->word EFGH
pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1
pshufw mm1, mm2, 0x94 ; word EFFG
pmaddwd mm1, mm5 ; multiply 2px with F2/F3
paddd mm0, mm1 ; finish 2nd 2px
; merge two sets of 2 pixels into one set of 4, round/clip/store
packssdw mm3, mm0 ; merge dword->word (4px)
paddsw mm3, mm7 ; rounding
psraw mm3, 7 ; drop the 7 fractional bits of the filter scale
packuswb mm3, mm6 ; clip and word->bytes
movd [dstq], mm3 ; store
; go to next line
add dstq, dststrideq
add srcq, srcstrideq
dec heightd ; next row
jg .nextrow
RET
; 4x4 block, H-only 6-tap filter
;
; Horizontal 6-tap subpel filter producing 4 output pixels per row (MMXEXT).
; Arguments (named on the cglobal line): dst, dststride, src, srcstride,
; height, mx. picreg is an extra GPR that only exists when PIC is set
; (npicregs extra registers are requested).
;
; Per row: reads 8 bytes starting at src-2 plus 4 bytes starting at src+3,
; writes 4 bytes to dst.
; Register roles inside the loop:
;   mm4 = F0/F1, mm5 = F2/F3, mm6 = F4/F5 coefficient pairs
;   mm7 = rounding constant loaded from pw_64
;   mm3 = zero at the top of each iteration; it is reused as a temporary
;         mid-loop and re-zeroed before the final unpack/pack
; NOTE(review): pw_64 is defined outside this view; presumably 64 in every
; word, i.e. half of the 1<<7 scale removed by the psraw below - confirm.
; NOTE(review): the loop body runs before height is tested, so height >= 1
; is assumed - confirm against callers.
INIT_MMX mmxext
cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
lea mxd, [mxq*3] ; mx *= 3; with the *8 scale below this is mx*24 bytes
%if PIC
; under PIC, sixtap_filter_hw is %defined to picregq, so load the table base first
lea picregq, [sixtap_filter_hw_m]
%endif
; The three loads pick the 16-byte rows at byte offsets mx*24-48, -32 and -16.
; NOTE(review): presumably mx is even (2, 4, 6) so that the three rows
; belong to the same filter - verify against caller.
movq mm4, [sixtap_filter_hw+mxq*8-48] ; set up 6tap filter in words
movq mm5, [sixtap_filter_hw+mxq*8-32]
movq mm6, [sixtap_filter_hw+mxq*8-16]
movq mm7, [pw_64]
pxor mm3, mm3
.nextrow:
movq mm1, [srcq-2] ; (ABCDEFGH) load 8 horizontal pixels
; first set of 2 pixels
movq mm2, mm1 ; byte ABCD..
punpcklbw mm1, mm3 ; byte->word ABCD
pshufw mm0, mm2, 0x9 ; byte CDEF..
punpckhbw mm2, mm3 ; byte->word EFGH
punpcklbw mm0, mm3 ; byte->word CDEF
pshufw mm1, mm1, 0x94 ; word ABBC
pshufw mm2, mm2, 0x94 ; word EFFG
pmaddwd mm1, mm4 ; multiply 2px with F0/F1
pshufw mm3, mm0, 0x94 ; word CDDE (mm3 no longer holds zero from here on)
movq mm0, mm3 ; backup for second set of pixels
pmaddwd mm3, mm5 ; multiply 2px with F2/F3
paddd mm1, mm3 ; add to 1st 2px cache
movq mm3, mm2 ; backup for second set of pixels
pmaddwd mm2, mm6 ; multiply 2px with F4/F5
paddd mm1, mm2 ; finish 1st 2px
; second set of 2 pixels, use backup of above
movd mm2, [srcq+3] ; byte FGHI (prevent overreads)
pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1
pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3
paddd mm0, mm3 ; add to 2nd 2px cache
pxor mm3, mm3 ; restore mm3 = 0; also relied upon by the next iteration
punpcklbw mm2, mm3 ; byte->word FGHI
pshufw mm2, mm2, 0xE9 ; word GHHI
pmaddwd mm2, mm6 ; multiply 2px with F4/F5
paddd mm0, mm2 ; finish 2nd 2px
; merge two sets of 2 pixels into one set of 4, round/clip/store
packssdw mm1, mm0 ; merge dword->word (4px)
paddsw mm1, mm7 ; rounding
psraw mm1, 7 ; drop the 7 fractional bits of the filter scale
packuswb mm1, mm3 ; clip and word->bytes
movd [dstq], mm1 ; store
; go to next line
add dstq, dststrideq
add srcq, srcstrideq
dec heightd ; next row
jg .nextrow
RET
INIT_XMM sse2
cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
shl mxd, 5
@@ -539,9 +410,9 @@ cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, h
jg .nextrow
RET
%macro FILTER_V 1
INIT_XMM sse2
; 4x4 block, V-only 4-tap filter
cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
cglobal put_vp8_epel8_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
shl myd, 5
%if PIC
lea picregq, [fourtap_filter_v_m]
@@ -594,7 +465,7 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr
; 4x4 block, V-only 6-tap filter
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
cglobal put_vp8_epel8_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
shl myd, 4
lea myq, [myq*3]
%if PIC
@@ -656,12 +527,6 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr
dec heightd ; next row
jg .nextrow
RET
%endmacro
INIT_MMX mmxext
FILTER_V 4
INIT_XMM sse2
FILTER_V 8
%macro FILTER_BILINEAR 1
%if cpuflag(ssse3)
@@ -722,16 +587,9 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, p
psraw m2, 2
pavgw m0, m6
pavgw m2, m6
%if mmsize == 8
packuswb m0, m0
packuswb m2, m2
movh [dstq+dststrideq*0], m0
movh [dstq+dststrideq*1], m2
%else
packuswb m0, m2
movh [dstq+dststrideq*0], m0
movhps [dstq+dststrideq*1], m0
%endif
%endif ; cpuflag(ssse3)
lea dstq, [dstq+dststrideq*2]
@@ -799,16 +657,9 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride
psraw m2, 2
pavgw m0, m6
pavgw m2, m6
%if mmsize == 8
packuswb m0, m0
packuswb m2, m2
movh [dstq+dststrideq*0], m0
movh [dstq+dststrideq*1], m2
%else
packuswb m0, m2
movh [dstq+dststrideq*0], m0
movhps [dstq+dststrideq*1], m0
%endif
%endif ; cpuflag(ssse3)
lea dstq, [dstq+dststrideq*2]
@@ -818,8 +669,6 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride
RET
%endmacro
INIT_MMX mmxext
FILTER_BILINEAR 4
INIT_XMM sse2
FILTER_BILINEAR 8
INIT_MMX ssse3

View File

@@ -29,19 +29,6 @@
/*
* MC functions
*/
/* MMXEXT versions of the 4-pixel-wide subpel ("epel") put functions.
 * h4/h6 are the horizontal-only 4-tap/6-tap filters, v4/v6 the
 * vertical-only 4-tap/6-tap filters. All four share one prototype.
 * NOTE(review): the bodies live in the accompanying x86 asm file;
 * presumably the h variants only consume mx and the v variants only
 * my - confirm against the asm. */
void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride,
const uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride,
const uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride,
const uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride,
const uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_h4_sse2 (uint8_t *dst, ptrdiff_t dststride,
const uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
@@ -80,9 +67,6 @@ void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
const uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride,
const uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, ptrdiff_t dststride,
const uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
@@ -93,9 +77,6 @@ void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
const uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride,
const uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, ptrdiff_t dststride,
const uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
@@ -159,14 +140,6 @@ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT
dst, dststride, tmpptr, SIZE, height, mx, my); \
}
#define HVTAPMMX(x, y) \
HVTAP(mmxext, 8, x, y, 4, 8)
HVTAPMMX(4, 4)
HVTAPMMX(4, 6)
HVTAPMMX(6, 4)
HVTAPMMX(6, 6)
#define HVTAPSSE2(x, y, w) \
HVTAP(sse2, 16, x, y, w, 16) \
HVTAP(ssse3, 16, x, y, w, 16)
@@ -194,7 +167,6 @@ static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
dst, dststride, tmp, SIZE, height, mx, my); \
}
HVBILIN(mmxext, 8, 4, 8)
HVBILIN(sse2, 8, 8, 16)
HVBILIN(sse2, 8, 16, 16)
HVBILIN(ssse3, 8, 4, 8)
@@ -285,13 +257,6 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
}
/* note that 4-tap width=16 functions are missing because w=16
* is only used for luma, and luma is always a copy or sixtap. */
if (EXTERNAL_MMXEXT(cpu_flags)) {
VP8_MC_FUNC(2, 4, mmxext);
VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
}
if (EXTERNAL_SSE(cpu_flags)) {
c->put_vp8_epel_pixels_tab[0][0][0] =
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
@@ -304,6 +269,8 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
VP8_BILINEAR_MC_FUNC(1, 8, sse2);
}
/* note that 4-tap width=16 functions are missing because w=16
* is only used for luma, and luma is always a copy or sixtap. */
if (EXTERNAL_SSSE3(cpu_flags)) {
VP8_LUMA_MC_FUNC(0, 16, ssse3);
VP8_MC_FUNC(1, 8, ssse3);