From ed5e0f9c68c773d302fe766e9a924a17f293244d Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt Date: Sun, 23 Nov 2025 11:08:14 +0100 Subject: [PATCH] avcodec/x86/vp8dsp: Remove MMXEXT functions overridden by SSSE3 SSSE3 is already quite old (introduced 2006 for Intel, 2011 for AMD), so that the overwhelming majority of our users (particularly those that actually update their FFmpeg) will be using the SSSE3 versions. This commit therefore removes the MMX(EXT) functions overridden by them (which don't abide by the ABI) to get closer to a removal of emms_c. Reviewed-by: Ronald S. Bultje Signed-off-by: Andreas Rheinhardt --- libavcodec/x86/vp8dsp.asm | 159 +---------------------------------- libavcodec/x86/vp8dsp_init.c | 37 +------- 2 files changed, 6 insertions(+), 190 deletions(-) diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm index 231c21ea0d..7b836351e4 100644 --- a/libavcodec/x86/vp8dsp.asm +++ b/libavcodec/x86/vp8dsp.asm @@ -1,5 +1,5 @@ ;****************************************************************************** -;* VP8 MMXEXT optimizations +;* VP8 ASM optimizations ;* Copyright (c) 2010 Ronald S. Bultje ;* Copyright (c) 2010 Fiona Glaser ;* @@ -24,25 +24,6 @@ SECTION_RODATA -fourtap_filter_hw_m: times 4 dw -6, 123 - times 4 dw 12, -1 - times 4 dw -9, 93 - times 4 dw 50, -6 - times 4 dw -6, 50 - times 4 dw 93, -9 - times 4 dw -1, 12 - times 4 dw 123, -6 - -sixtap_filter_hw_m: times 4 dw 2, -11 - times 4 dw 108, 36 - times 4 dw -8, 1 - times 4 dw 3, -16 - times 4 dw 77, 77 - times 4 dw -16, 3 - times 4 dw 1, -8 - times 4 dw 36, 108 - times 4 dw -11, 2 - fourtap_filter_hb_m: times 8 db -6, 123 times 8 db 12, -1 times 8 db -9, 93 @@ -115,8 +96,6 @@ bilinear_filter_vb_m: times 8 db 7, 1 times 8 db 1, 7 %if PIC -%define fourtap_filter_hw picregq -%define sixtap_filter_hw picregq %define fourtap_filter_hb picregq %define sixtap_filter_hb picregq %define fourtap_filter_v picregq @@ -125,8 +104,6 @@ bilinear_filter_vb_m: times 8 db 7, 1 %define bilinear_filter_vb picregq %define npicregs 1 %else -%define fourtap_filter_hw fourtap_filter_hw_m -%define sixtap_filter_hw sixtap_filter_hw_m %define fourtap_filter_hb fourtap_filter_hb_m %define sixtap_filter_hb sixtap_filter_hb_m %define fourtap_filter_v fourtap_filter_v_m @@ -322,112 +299,6 @@ FILTER_SSSE3 4 INIT_XMM ssse3 FILTER_SSSE3 8 -; 4x4 block, H-only 4-tap filter -INIT_MMX mmxext -cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg - shl mxd, 4 -%if PIC - lea picregq, [fourtap_filter_hw_m] -%endif - movq mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words - movq mm5, [fourtap_filter_hw+mxq] - movq mm7, [pw_64] - pxor mm6, mm6 - -.nextrow: - movq mm1, [srcq-1] ; (ABCDEFGH) load 8 horizontal pixels - - ; first set of 2 pixels - movq mm2, mm1 ; byte ABCD.. - punpcklbw mm1, mm6 ; byte->word ABCD - pshufw mm0, mm2, 9 ; byte CDEF.. - punpcklbw mm0, mm6 ; byte->word CDEF - pshufw mm3, mm1, 0x94 ; word ABBC - pshufw mm1, mm0, 0x94 ; word CDDE - pmaddwd mm3, mm4 ; multiply 2px with F0/F1 - movq mm0, mm1 ; backup for second set of pixels - pmaddwd mm1, mm5 ; multiply 2px with F2/F3 - paddd mm3, mm1 ; finish 1st 2px - - ; second set of 2 pixels, use backup of above - punpckhbw mm2, mm6 ; byte->word EFGH - pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1 - pshufw mm1, mm2, 0x94 ; word EFFG - pmaddwd mm1, mm5 ; multiply 2px with F2/F3 - paddd mm0, mm1 ; finish 2nd 2px - - ; merge two sets of 2 pixels into one set of 4, round/clip/store - packssdw mm3, mm0 ; merge dword->word (4px) - paddsw mm3, mm7 ; rounding - psraw mm3, 7 - packuswb mm3, mm6 ; clip and word->bytes - movd [dstq], mm3 ; store - - ; go to next line - add dstq, dststrideq - add srcq, srcstrideq - dec heightd ; next row - jg .nextrow - RET - -; 4x4 block, H-only 6-tap filter -INIT_MMX mmxext -cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg - lea mxd, [mxq*3] -%if PIC - lea picregq, [sixtap_filter_hw_m] -%endif - movq mm4, [sixtap_filter_hw+mxq*8-48] ; set up 4tap filter in words - movq mm5, [sixtap_filter_hw+mxq*8-32] - movq mm6, [sixtap_filter_hw+mxq*8-16] - movq mm7, [pw_64] - pxor mm3, mm3 - -.nextrow: - movq mm1, [srcq-2] ; (ABCDEFGH) load 8 horizontal pixels - - ; first set of 2 pixels - movq mm2, mm1 ; byte ABCD.. - punpcklbw mm1, mm3 ; byte->word ABCD - pshufw mm0, mm2, 0x9 ; byte CDEF.. - punpckhbw mm2, mm3 ; byte->word EFGH - punpcklbw mm0, mm3 ; byte->word CDEF - pshufw mm1, mm1, 0x94 ; word ABBC - pshufw mm2, mm2, 0x94 ; word EFFG - pmaddwd mm1, mm4 ; multiply 2px with F0/F1 - pshufw mm3, mm0, 0x94 ; word CDDE - movq mm0, mm3 ; backup for second set of pixels - pmaddwd mm3, mm5 ; multiply 2px with F2/F3 - paddd mm1, mm3 ; add to 1st 2px cache - movq mm3, mm2 ; backup for second set of pixels - pmaddwd mm2, mm6 ; multiply 2px with F4/F5 - paddd mm1, mm2 ; finish 1st 2px - - ; second set of 2 pixels, use backup of above - movd mm2, [srcq+3] ; byte FGHI (prevent overreads) - pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1 - pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3 - paddd mm0, mm3 ; add to 2nd 2px cache - pxor mm3, mm3 - punpcklbw mm2, mm3 ; byte->word FGHI - pshufw mm2, mm2, 0xE9 ; word GHHI - pmaddwd mm2, mm6 ; multiply 2px with F4/F5 - paddd mm0, mm2 ; finish 2nd 2px - - ; merge two sets of 2 pixels into one set of 4, round/clip/store - packssdw mm1, mm0 ; merge dword->word (4px) - paddsw mm1, mm7 ; rounding - psraw mm1, 7 - packuswb mm1, mm3 ; clip and word->bytes - movd [dstq], mm1 ; store - - ; go to next line - add dstq, dststrideq - add srcq, srcstrideq - dec heightd ; next row - jg .nextrow - RET - INIT_XMM sse2 cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg shl mxd, 5 @@ -539,9 +410,9 @@ cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, h jg .nextrow RET -%macro FILTER_V 1 +INIT_XMM sse2 ; 4x4 block, V-only 4-tap filter -cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my +cglobal put_vp8_epel8_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my shl myd, 5 %if PIC lea picregq, [fourtap_filter_v_m] @@ -594,7 +465,7 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr ; 4x4 block, V-only 6-tap filter -cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my +cglobal put_vp8_epel8_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my shl myd, 4 lea myq, [myq*3] %if PIC @@ -656,12 +527,6 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr dec heightd ; next row jg .nextrow RET -%endmacro - -INIT_MMX mmxext -FILTER_V 4 -INIT_XMM sse2 -FILTER_V 8 %macro FILTER_BILINEAR 1 %if cpuflag(ssse3) @@ -722,16 +587,9 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, p psraw m2, 2 pavgw m0, m6 pavgw m2, m6 -%if mmsize == 8 - packuswb m0, m0 - packuswb m2, m2 - movh [dstq+dststrideq*0], m0 - movh [dstq+dststrideq*1], m2 -%else packuswb m0, m2 movh [dstq+dststrideq*0], m0 movhps [dstq+dststrideq*1], m0 -%endif %endif ; cpuflag(ssse3) lea dstq, [dstq+dststrideq*2] @@ -799,16 +657,9 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride psraw m2, 2 pavgw m0, m6 pavgw m2, m6 -%if mmsize == 8 - packuswb m0, m0 - packuswb m2, m2 - movh [dstq+dststrideq*0], m0 - movh [dstq+dststrideq*1], m2 -%else packuswb m0, m2 movh [dstq+dststrideq*0], m0 movhps [dstq+dststrideq*1], m0 -%endif %endif ; cpuflag(ssse3) lea dstq, [dstq+dststrideq*2] @@ -818,8 +669,6 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride RET %endmacro -INIT_MMX mmxext -FILTER_BILINEAR 4 INIT_XMM sse2 FILTER_BILINEAR 8 INIT_MMX ssse3 diff --git a/libavcodec/x86/vp8dsp_init.c b/libavcodec/x86/vp8dsp_init.c index e37afab775..00733a2564 100644 --- a/libavcodec/x86/vp8dsp_init.c +++ b/libavcodec/x86/vp8dsp_init.c @@ -29,19 +29,6 @@ /* * MC functions */ -void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride, - const uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride, - const uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride, - const uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride, - const uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); - void ff_put_vp8_epel8_h4_sse2 (uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); @@ -80,9 +67,6 @@ void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); -void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride, - const uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); @@ -93,9 +77,6 @@ void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); -void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride, - const uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); @@ -159,14 +140,6 @@ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT dst, dststride, tmpptr, SIZE, height, mx, my); \ } -#define HVTAPMMX(x, y) \ -HVTAP(mmxext, 8, x, y, 4, 8) - -HVTAPMMX(4, 4) -HVTAPMMX(4, 6) -HVTAPMMX(6, 4) -HVTAPMMX(6, 6) - #define HVTAPSSE2(x, y, w) \ HVTAP(sse2, 16, x, y, w, 16) \ HVTAP(ssse3, 16, x, y, w, 16) @@ -194,7 +167,6 @@ static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \ dst, dststride, tmp, SIZE, height, mx, my); \ } -HVBILIN(mmxext, 8, 4, 8) HVBILIN(sse2, 8, 8, 16) HVBILIN(sse2, 8, 16, 16) HVBILIN(ssse3, 8, 4, 8) @@ -285,13 +257,6 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c) c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx; } - /* note that 4-tap width=16 functions are missing because w=16 - * is only used for luma, and luma is always a copy or sixtap. */ - if (EXTERNAL_MMXEXT(cpu_flags)) { - VP8_MC_FUNC(2, 4, mmxext); - VP8_BILINEAR_MC_FUNC(2, 4, mmxext); - } - if (EXTERNAL_SSE(cpu_flags)) { c->put_vp8_epel_pixels_tab[0][0][0] = c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse; @@ -304,6 +269,8 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c) VP8_BILINEAR_MC_FUNC(1, 8, sse2); } + /* note that 4-tap width=16 functions are missing because w=16 + * is only used for luma, and luma is always a copy or sixtap. */ if (EXTERNAL_SSSE3(cpu_flags)) { VP8_LUMA_MC_FUNC(0, 16, ssse3); VP8_MC_FUNC(1, 8, ssse3);