diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm index 231c21ea0d..7b836351e4 100644 --- a/libavcodec/x86/vp8dsp.asm +++ b/libavcodec/x86/vp8dsp.asm @@ -1,5 +1,5 @@ ;****************************************************************************** -;* VP8 MMXEXT optimizations +;* VP8 ASM optimizations ;* Copyright (c) 2010 Ronald S. Bultje ;* Copyright (c) 2010 Fiona Glaser ;* @@ -24,25 +24,6 @@ SECTION_RODATA -fourtap_filter_hw_m: times 4 dw -6, 123 - times 4 dw 12, -1 - times 4 dw -9, 93 - times 4 dw 50, -6 - times 4 dw -6, 50 - times 4 dw 93, -9 - times 4 dw -1, 12 - times 4 dw 123, -6 - -sixtap_filter_hw_m: times 4 dw 2, -11 - times 4 dw 108, 36 - times 4 dw -8, 1 - times 4 dw 3, -16 - times 4 dw 77, 77 - times 4 dw -16, 3 - times 4 dw 1, -8 - times 4 dw 36, 108 - times 4 dw -11, 2 - fourtap_filter_hb_m: times 8 db -6, 123 times 8 db 12, -1 times 8 db -9, 93 @@ -115,8 +96,6 @@ bilinear_filter_vb_m: times 8 db 7, 1 times 8 db 1, 7 %if PIC -%define fourtap_filter_hw picregq -%define sixtap_filter_hw picregq %define fourtap_filter_hb picregq %define sixtap_filter_hb picregq %define fourtap_filter_v picregq @@ -125,8 +104,6 @@ bilinear_filter_vb_m: times 8 db 7, 1 %define bilinear_filter_vb picregq %define npicregs 1 %else -%define fourtap_filter_hw fourtap_filter_hw_m -%define sixtap_filter_hw sixtap_filter_hw_m %define fourtap_filter_hb fourtap_filter_hb_m %define sixtap_filter_hb sixtap_filter_hb_m %define fourtap_filter_v fourtap_filter_v_m @@ -322,112 +299,6 @@ FILTER_SSSE3 4 INIT_XMM ssse3 FILTER_SSSE3 8 -; 4x4 block, H-only 4-tap filter -INIT_MMX mmxext -cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg - shl mxd, 4 -%if PIC - lea picregq, [fourtap_filter_hw_m] -%endif - movq mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words - movq mm5, [fourtap_filter_hw+mxq] - movq mm7, [pw_64] - pxor mm6, mm6 - -.nextrow: - movq mm1, [srcq-1] ; (ABCDEFGH) load 8 horizontal pixels - - ; first set of 2 pixels - movq mm2, mm1 ; byte ABCD.. - punpcklbw mm1, mm6 ; byte->word ABCD - pshufw mm0, mm2, 9 ; byte CDEF.. - punpcklbw mm0, mm6 ; byte->word CDEF - pshufw mm3, mm1, 0x94 ; word ABBC - pshufw mm1, mm0, 0x94 ; word CDDE - pmaddwd mm3, mm4 ; multiply 2px with F0/F1 - movq mm0, mm1 ; backup for second set of pixels - pmaddwd mm1, mm5 ; multiply 2px with F2/F3 - paddd mm3, mm1 ; finish 1st 2px - - ; second set of 2 pixels, use backup of above - punpckhbw mm2, mm6 ; byte->word EFGH - pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1 - pshufw mm1, mm2, 0x94 ; word EFFG - pmaddwd mm1, mm5 ; multiply 2px with F2/F3 - paddd mm0, mm1 ; finish 2nd 2px - - ; merge two sets of 2 pixels into one set of 4, round/clip/store - packssdw mm3, mm0 ; merge dword->word (4px) - paddsw mm3, mm7 ; rounding - psraw mm3, 7 - packuswb mm3, mm6 ; clip and word->bytes - movd [dstq], mm3 ; store - - ; go to next line - add dstq, dststrideq - add srcq, srcstrideq - dec heightd ; next row - jg .nextrow - RET - -; 4x4 block, H-only 6-tap filter -INIT_MMX mmxext -cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg - lea mxd, [mxq*3] -%if PIC - lea picregq, [sixtap_filter_hw_m] -%endif - movq mm4, [sixtap_filter_hw+mxq*8-48] ; set up 4tap filter in words - movq mm5, [sixtap_filter_hw+mxq*8-32] - movq mm6, [sixtap_filter_hw+mxq*8-16] - movq mm7, [pw_64] - pxor mm3, mm3 - -.nextrow: - movq mm1, [srcq-2] ; (ABCDEFGH) load 8 horizontal pixels - - ; first set of 2 pixels - movq mm2, mm1 ; byte ABCD.. - punpcklbw mm1, mm3 ; byte->word ABCD - pshufw mm0, mm2, 0x9 ; byte CDEF.. - punpckhbw mm2, mm3 ; byte->word EFGH - punpcklbw mm0, mm3 ; byte->word CDEF - pshufw mm1, mm1, 0x94 ; word ABBC - pshufw mm2, mm2, 0x94 ; word EFFG - pmaddwd mm1, mm4 ; multiply 2px with F0/F1 - pshufw mm3, mm0, 0x94 ; word CDDE - movq mm0, mm3 ; backup for second set of pixels - pmaddwd mm3, mm5 ; multiply 2px with F2/F3 - paddd mm1, mm3 ; add to 1st 2px cache - movq mm3, mm2 ; backup for second set of pixels - pmaddwd mm2, mm6 ; multiply 2px with F4/F5 - paddd mm1, mm2 ; finish 1st 2px - - ; second set of 2 pixels, use backup of above - movd mm2, [srcq+3] ; byte FGHI (prevent overreads) - pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1 - pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3 - paddd mm0, mm3 ; add to 2nd 2px cache - pxor mm3, mm3 - punpcklbw mm2, mm3 ; byte->word FGHI - pshufw mm2, mm2, 0xE9 ; word GHHI - pmaddwd mm2, mm6 ; multiply 2px with F4/F5 - paddd mm0, mm2 ; finish 2nd 2px - - ; merge two sets of 2 pixels into one set of 4, round/clip/store - packssdw mm1, mm0 ; merge dword->word (4px) - paddsw mm1, mm7 ; rounding - psraw mm1, 7 - packuswb mm1, mm3 ; clip and word->bytes - movd [dstq], mm1 ; store - - ; go to next line - add dstq, dststrideq - add srcq, srcstrideq - dec heightd ; next row - jg .nextrow - RET - INIT_XMM sse2 cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg shl mxd, 5 @@ -539,9 +410,9 @@ cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, h jg .nextrow RET -%macro FILTER_V 1 +INIT_XMM sse2 ; 4x4 block, V-only 4-tap filter -cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my +cglobal put_vp8_epel8_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my shl myd, 5 %if PIC lea picregq, [fourtap_filter_v_m] @@ -594,7 +465,7 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr ; 4x4 block, V-only 6-tap filter -cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my +cglobal put_vp8_epel8_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my shl myd, 4 lea myq, [myq*3] %if PIC @@ -656,12 +527,6 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr dec heightd ; next row jg .nextrow RET -%endmacro - -INIT_MMX mmxext -FILTER_V 4 -INIT_XMM sse2 -FILTER_V 8 %macro FILTER_BILINEAR 1 %if cpuflag(ssse3) @@ -722,16 +587,9 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, p psraw m2, 2 pavgw m0, m6 pavgw m2, m6 -%if mmsize == 8 - packuswb m0, m0 - packuswb m2, m2 - movh [dstq+dststrideq*0], m0 - movh [dstq+dststrideq*1], m2 -%else packuswb m0, m2 movh [dstq+dststrideq*0], m0 movhps [dstq+dststrideq*1], m0 -%endif %endif ; cpuflag(ssse3) lea dstq, [dstq+dststrideq*2] @@ -799,16 +657,9 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride psraw m2, 2 pavgw m0, m6 pavgw m2, m6 -%if mmsize == 8 - packuswb m0, m0 - packuswb m2, m2 - movh [dstq+dststrideq*0], m0 - movh [dstq+dststrideq*1], m2 -%else packuswb m0, m2 movh [dstq+dststrideq*0], m0 movhps [dstq+dststrideq*1], m0 -%endif %endif ; cpuflag(ssse3) lea dstq, [dstq+dststrideq*2] @@ -818,8 +669,6 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride RET %endmacro -INIT_MMX mmxext -FILTER_BILINEAR 4 INIT_XMM sse2 FILTER_BILINEAR 8 INIT_MMX ssse3 diff --git a/libavcodec/x86/vp8dsp_init.c b/libavcodec/x86/vp8dsp_init.c index e37afab775..00733a2564 100644 --- a/libavcodec/x86/vp8dsp_init.c +++ b/libavcodec/x86/vp8dsp_init.c @@ -29,19 +29,6 @@ /* * MC functions */ -void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride, - const uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride, - const uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride, - const uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); -void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride, - const uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); - void ff_put_vp8_epel8_h4_sse2 (uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); @@ -80,9 +67,6 @@ void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); -void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride, - const uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); @@ -93,9 +77,6 @@ void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); -void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride, - const uint8_t *src, ptrdiff_t srcstride, - int height, int mx, int my); void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my); @@ -159,14 +140,6 @@ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT dst, dststride, tmpptr, SIZE, height, mx, my); \ } -#define HVTAPMMX(x, y) \ -HVTAP(mmxext, 8, x, y, 4, 8) - -HVTAPMMX(4, 4) -HVTAPMMX(4, 6) -HVTAPMMX(6, 4) -HVTAPMMX(6, 6) - #define HVTAPSSE2(x, y, w) \ HVTAP(sse2, 16, x, y, w, 16) \ HVTAP(ssse3, 16, x, y, w, 16) @@ -194,7 +167,6 @@ static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \ dst, dststride, tmp, SIZE, height, mx, my); \ } -HVBILIN(mmxext, 8, 4, 8) HVBILIN(sse2, 8, 8, 16) HVBILIN(sse2, 8, 16, 16) HVBILIN(ssse3, 8, 4, 8) @@ -285,13 +257,6 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c) c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx; } - /* note that 4-tap width=16 functions are missing because w=16 - * is only used for luma, and luma is always a copy or sixtap. */ - if (EXTERNAL_MMXEXT(cpu_flags)) { - VP8_MC_FUNC(2, 4, mmxext); - VP8_BILINEAR_MC_FUNC(2, 4, mmxext); - } - if (EXTERNAL_SSE(cpu_flags)) { c->put_vp8_epel_pixels_tab[0][0][0] = c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse; @@ -304,6 +269,8 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c) VP8_BILINEAR_MC_FUNC(1, 8, sse2); } + /* note that 4-tap width=16 functions are missing because w=16 + * is only used for luma, and luma is always a copy or sixtap. */ if (EXTERNAL_SSSE3(cpu_flags)) { VP8_LUMA_MC_FUNC(0, 16, ssse3); VP8_MC_FUNC(1, 8, ssse3);