avfilter/f_sidedata: also handle global side data in filter links

Should fix issue #21071.

Signed-off-by: James Almer <jamrial@gmail.com>
avformat/movenc: Fix leak of IAMFContext on error
2025-12-05 14:30:00 +01:00 · 2025-12-04 13:50:45 -03:00 · 2025-12-04 16:15:09 +00:00 · 2025-12-04 16:53:58 +01:00 · 2025-12-04 15:17:37 +01:00 · 2025-12-04 15:17:37 +01:00
11 changed files with 336 additions and 357 deletions
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -90,27 +90,22 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
            c->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_rvv;
            c->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_rvv;
            c->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_rvv;
-            c->put_vp8_epel_pixels_tab[0][0][1] = ff_put_vp8_epel16_h4_rvv;
            c->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_rvv;
            c->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_rvv;

            c->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_rvv;
            c->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_rvv;
            c->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_rvv;
-            c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv;
            c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv;
            c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv;
 #if __riscv_xlen <= 64
            c->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_rvv;
            c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_rvv;
            c->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_rvv;
-            c->put_vp8_epel_pixels_tab[0][2][1] = ff_put_vp8_epel16_h4v6_rvv;
            c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_rvv;
            c->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_rvv;
-            c->put_vp8_epel_pixels_tab[0][1][1] = ff_put_vp8_epel16_h4v4_rvv;
            c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_rvv;
            c->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_rvv;
-            c->put_vp8_epel_pixels_tab[0][1][2] = ff_put_vp8_epel16_h6v4_rvv;
            c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_rvv;
            c->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_rvv;
 #endif
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -537,7 +537,14 @@ func ff_put_vp8_epel\len\()_h\hsize\()v\vsize\()_rvv, zve32x, zba
 endfunc
 .endm

-.irp len,16,8,4
+# Only the six-tap versions are used for epel16.
+epel 16 6 h
+epel 16 6 v
+#if __riscv_xlen <= 64
+epel_hv 16 6 6
+#endif
+
+.irp len,8,4
 epel \len 6 h
 epel \len 4 h
 epel \len 6 v
--- a/libavcodec/vp8dsp.c
+++ b/libavcodec/vp8dsp.c
@@ -558,26 +558,21 @@ put_vp8_epel ## SIZE ## _h ## HTAPS ## v ## VTAPS ## _c(uint8_t *dst,         \
    }                                                                         \
 }

-VP8_EPEL_H(16, 4)
 VP8_EPEL_H(8,  4)
 VP8_EPEL_H(4,  4)
 VP8_EPEL_H(16, 6)
 VP8_EPEL_H(8,  6)
 VP8_EPEL_H(4,  6)
-VP8_EPEL_V(16, 4)
 VP8_EPEL_V(8,  4)
 VP8_EPEL_V(4,  4)
 VP8_EPEL_V(16, 6)
 VP8_EPEL_V(8,  6)
 VP8_EPEL_V(4,  6)

-VP8_EPEL_HV(16, 4, 4)
 VP8_EPEL_HV(8,  4, 4)
 VP8_EPEL_HV(4,  4, 4)
-VP8_EPEL_HV(16, 4, 6)
 VP8_EPEL_HV(8,  4, 6)
 VP8_EPEL_HV(4,  4, 6)
-VP8_EPEL_HV(16, 6, 4)
 VP8_EPEL_HV(8,  6, 4)
 VP8_EPEL_HV(4,  6, 4)
 VP8_EPEL_HV(16, 6, 6)
@@ -667,7 +662,11 @@ VP8_BILINEAR(4)

 av_cold void ff_vp78dsp_init(VP8DSPContext *dsp)
 {
-    VP78_MC_FUNC(0, 16);
+    dsp->put_vp8_epel_pixels_tab[0][0][0] = put_vp8_pixels16_c;
+    dsp->put_vp8_epel_pixels_tab[0][0][2] = put_vp8_epel16_h6_c;
+    dsp->put_vp8_epel_pixels_tab[0][2][0] = put_vp8_epel16_v6_c;
+    dsp->put_vp8_epel_pixels_tab[0][2][2] = put_vp8_epel16_h6v6_c;
+
    VP78_MC_FUNC(1, 8);
    VP78_MC_FUNC(2, 4);

--- a/libavcodec/vulkan/dpx_unpack.comp
+++ b/libavcodec/vulkan/dpx_unpack.comp
@@ -44,8 +44,11 @@ i16vec4 parse_packed_in_32(ivec2 pos, int stride)
 #else
 i16vec4 parse_packed_in_32(ivec2 pos, int stride)
 {
-    uint line_off = pos.y*(stride*BITS_PER_COMP*COMPONENTS +
-                           (need_align << 3));
+    uint line_size = stride*BITS_PER_COMP*COMPONENTS;
+    line_size += line_size & 31;
+    line_size += need_align << 3;
+
+    uint line_off = pos.y*line_size;
    uint pix_off = pos.x*BITS_PER_COMP*COMPONENTS;

    uint off = (line_off + pix_off >> 5);
--- a/libavcodec/vulkan/ffv1_common.comp
+++ b/libavcodec/vulkan/ffv1_common.comp
@@ -124,8 +124,12 @@ ivec2 get_pred(readonly uimage2D pred, ivec2 sp, ivec2 off,
        }
        base += quant_table[quant_table_idx][3][(cur2 - cur) & MAX_QUANT_TABLE_MASK];

+#if RGB_LINECACHE == 2
        /* top-2 became current upon swap */
        TYPE top2 = TYPE(imageLoad(pred, sp + LADDR(off))[comp]);
+#else
+        TYPE top2 = TYPE(imageLoad(pred, sp + LADDR(off + ivec2(0, -2)))[comp]);
+#endif
        base += quant_table[quant_table_idx][4][(top2 - top[1]) & MAX_QUANT_TABLE_MASK];
    }

--- a/libavcodec/vulkan_dpx.c
+++ b/libavcodec/vulkan_dpx.c
@@ -402,9 +402,7 @@ static int vk_decode_dpx_init(AVCodecContext *avctx)

    switch (dpx->pix_fmt) {
    case AV_PIX_FMT_GRAY10:
-    case AV_PIX_FMT_GRAY12:
    case AV_PIX_FMT_GBRAP10:
-    case AV_PIX_FMT_GBRAP12:
    case AV_PIX_FMT_UYVY422:
    case AV_PIX_FMT_YUV444P:
    case AV_PIX_FMT_YUVA444P:
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -1,5 +1,5 @@
 ;******************************************************************************
-;* VP8 MMXEXT optimizations
+;* VP8 ASM optimizations
 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
 ;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
 ;*
@@ -24,43 +24,43 @@

 SECTION_RODATA

-fourtap_filter_hw_m: times 4 dw  -6, 123
-                     times 4 dw  12,  -1
-                     times 4 dw  -9,  93
-                     times 4 dw  50,  -6
-                     times 4 dw  -6,  50
-                     times 4 dw  93,  -9
-                     times 4 dw  -1,  12
-                     times 4 dw 123,  -6
+fourtap_filter4_b_m: times 4 db  -6, 123
+                     times 4 db  12,  -1
+                     times 4 db  -9,  93
+                     times 4 db  50,  -6
+                     times 4 db  -6,  50
+                     times 4 db  93,  -9
+                     times 4 db  -1,  12
+                     times 4 db 123,  -6

-sixtap_filter_hw_m:  times 4 dw   2, -11
-                     times 4 dw 108,  36
-                     times 4 dw  -8,   1
-                     times 4 dw   3, -16
-                     times 4 dw  77,  77
-                     times 4 dw -16,   3
-                     times 4 dw   1,  -8
-                     times 4 dw  36, 108
-                     times 4 dw -11,   2
+sixtap_filter4_hb_m: times 8 db   2, -11
+                     times 4 db 108,  -8
+                     times 4 db  36,   1
+                     times 8 db   3, -16
+                     times 4 db  77, -16
+                     times 4 db  77,   3
+                     times 8 db   1,  -8
+                     times 4 db  36, -11
+                     times 4 db 108,   2

-fourtap_filter_hb_m: times 8 db  -6, 123
-                     times 8 db  12,  -1
-                     times 8 db  -9,  93
-                     times 8 db  50,  -6
-                     times 8 db  -6,  50
-                     times 8 db  93,  -9
-                     times 8 db  -1,  12
-                     times 8 db 123,  -6
+fourtap_filter_b_m:  times 8 db  -6,  12
+                     times 8 db 123,  -1
+                     times 8 db  -9,  50
+                     times 8 db  93,  -6
+                     times 8 db  -6,  93
+                     times 8 db  50,  -9
+                     times 8 db  -1, 123
+                     times 8 db  12,  -6

-sixtap_filter_hb_m:  times 8 db   2,   1
-                     times 8 db -11, 108
-                     times 8 db  36,  -8
-                     times 8 db   3,   3
-                     times 8 db -16,  77
-                     times 8 db  77, -16
-                     times 8 db   1,   2
-                     times 8 db  -8,  36
-                     times 8 db 108, -11
+sixtap_filter_b_m:   times 8 db   2,  36
+                     times 8 db -11,  -8
+                     times 8 db 108,   1
+                     times 8 db   3,  77
+                     times 8 db -16, -16
+                     times 8 db  77,   3
+                     times 8 db   1, 108
+                     times 8 db  -8, -11
+                     times 8 db  36,   2

 fourtap_filter_v_m:  times 8 dw  -6
                     times 8 dw 123
@@ -115,20 +115,20 @@ bilinear_filter_vb_m: times 8 db 7, 1
                      times 8 db 1, 7

 %if PIC
-%define fourtap_filter_hw  picregq
-%define sixtap_filter_hw   picregq
-%define fourtap_filter_hb  picregq
-%define sixtap_filter_hb   picregq
+%define fourtap_filter_b   picregq
+%define fourtap_filter4_b  picregq
+%define sixtap_filter_b    picregq
+%define sixtap_filter4_hb  picregq
 %define fourtap_filter_v   picregq
 %define sixtap_filter_v    picregq
 %define bilinear_filter_vw picregq
 %define bilinear_filter_vb picregq
 %define npicregs 1
 %else
-%define fourtap_filter_hw  fourtap_filter_hw_m
-%define sixtap_filter_hw   sixtap_filter_hw_m
-%define fourtap_filter_hb  fourtap_filter_hb_m
-%define sixtap_filter_hb   sixtap_filter_hb_m
+%define fourtap_filter_b   fourtap_filter_b_m
+%define fourtap_filter4_b  fourtap_filter4_b_m
+%define sixtap_filter_b    sixtap_filter_b_m
+%define sixtap_filter4_hb  sixtap_filter4_hb_m
 %define fourtap_filter_v   fourtap_filter_v_m
 %define sixtap_filter_v    sixtap_filter_v_m
 %define bilinear_filter_vw bilinear_filter_vw_m
@@ -136,12 +136,17 @@ bilinear_filter_vb_m: times 8 db 7, 1
 %define npicregs 0
 %endif

-filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,  6, 6,  7,  7,  8
-filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,  8, 8,  9,  9, 10
+filter4_h4_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3,  4, 4,  5, 5,  6
+filter4_h6_shuf: db 1, 3, 2, 4, 3, 5, 4, 6, 2, 4, 3,  5, 4,  6, 5,  7

-filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11,  7, 12
-filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,  7, 7,  8,  8,  9
-filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,  9, 9, 10, 10, 11
+filter_h4_shuf1: db 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5,  7, 6,  8, 7,  9
+filter_h4_shuf2: db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6,  8, 7,  9, 8, 10
+
+filter_h6_shuf1: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5,  8, 6,  9, 7, 10
+filter_h6_shuf2: db 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6,  9, 7, 10, 8, 11
+filter_h6_shuf3: db 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11, 9, 12
+
+filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,  6, 6,  7, 7,  8

 pw_20091: times 4 dw 20091
 pw_17734: times 4 dw 17734
@@ -162,72 +167,109 @@ SECTION .text
 ;-------------------------------------------------------------------------------

 %macro FILTER_SSSE3 1
-cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
+%if %1 == 4
+%define MOV movd
+%else
+%define MOV movq
+%endif
+
+cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 6+2*(%1==8), dst, dststride, src, srcstride, height, mx, picreg
+%if %1 == 4
+    mova      m3, [filter4_h6_shuf]
+%if PIC
+    lea  picregq, [sixtap_filter4_hb_m]
+%endif
+    shl      mxd, 4
+    mova      m4, [sixtap_filter4_hb+mxq-32]
+    mova      m5, [sixtap_filter4_hb+mxq-16]
+%else
    lea      mxd, [mxq*3]
    mova      m3, [filter_h6_shuf2]
    mova      m4, [filter_h6_shuf3]
 %if PIC
-    lea  picregq, [sixtap_filter_hb_m]
+    lea  picregq, [sixtap_filter_b_m]
+%endif
+    mova      m5, [sixtap_filter_b+mxq*8-48] ; set up 6tap filter in bytes
+    mova      m6, [sixtap_filter_b+mxq*8-32]
+    mova      m7, [sixtap_filter_b+mxq*8-16]
 %endif
-    mova      m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
-    mova      m6, [sixtap_filter_hb+mxq*8-32]
-    mova      m7, [sixtap_filter_hb+mxq*8-16]

 .nextrow:
+%if %1 == 4
+    ; we need nine bytes, so two loads
+    movq      m1, [srcq-1]
+    movq      m0, [srcq-2]
+    punpcklbw m0, m1
+    pshufb    m1, m3
+    pmaddubsw m1, m5
+    pmaddubsw m0, m4
+    movhlps   m2, m1
+%else
    movu      m0, [srcq-2]
    mova      m1, m0
    mova      m2, m0
-%if mmsize == 8
-; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
-; shuffle with a memory operand
-    punpcklbw m0, [srcq+3]
-%else
    pshufb    m0, [filter_h6_shuf1]
-%endif
    pshufb    m1, m3
    pshufb    m2, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m7
-    paddsw    m0, m1
+%endif
+    add     srcq, srcstrideq
+    paddw     m0, m1
    paddsw    m0, m2
    pmulhrsw  m0, [pw_256]
    packuswb  m0, m0
-    movh  [dstq], m0        ; store
+    MOV   [dstq], m0        ; store

    ; go to next line
    add     dstq, dststrideq
-    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
    RET

-cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
-    shl      mxd, 4
+cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 6+!!(%1 == 8), dst, dststride, src, srcstride, height, mx, picreg
    mova      m2, [pw_256]
-    mova      m3, [filter_h2_shuf]
-    mova      m4, [filter_h4_shuf]
+%if %1 == 8
+    shl      mxd, 4
+    mova      m3, [filter_h4_shuf1]
+    mova      m4, [filter_h4_shuf2]
 %if PIC
-    lea  picregq, [fourtap_filter_hb_m]
+    lea  picregq, [fourtap_filter_b_m]
+%endif
+    mova      m5, [fourtap_filter_b+mxq-16] ; set up 4tap filter in bytes
+    mova      m6, [fourtap_filter_b+mxq]
+%else
+    shl      mxd, 3
+    mova      m3, [filter4_h4_shuf]
+%if PIC
+    lea  picregq, [fourtap_filter4_b_m]
+%endif
+    mova      m5, [fourtap_filter4_b+mxq-8]
 %endif
-    mova      m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
-    mova      m6, [fourtap_filter_hb+mxq]

 .nextrow:
+%if %1 == 4
+    movq      m0, [srcq-1]
+    pshufb    m0, m3
+    pmaddubsw m0, m5
+    movhlps   m1, m0
+%else
    movu      m0, [srcq-1]
    mova      m1, m0
    pshufb    m0, m3
    pshufb    m1, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
+%endif
+    add     srcq, srcstrideq
    paddsw    m0, m1
    pmulhrsw  m0, m2
    packuswb  m0, m0
-    movh  [dstq], m0        ; store
+    MOV   [dstq], m0        ; store

    ; go to next line
    add     dstq, dststrideq
-    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
    RET
@@ -235,71 +277,124 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, h
 cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
 %if PIC
-    lea  picregq, [fourtap_filter_hb_m]
+    lea  picregq, [fourtap_filter_b_m]
 %endif
-    mova      m5, [fourtap_filter_hb+myq-16]
-    mova      m6, [fourtap_filter_hb+myq]
+    mova      m5, [fourtap_filter_b+myq-16]
+    mova      m6, [fourtap_filter_b+myq]
    mova      m7, [pw_256]

    ; read 3 lines
-    sub     srcq, srcstrideq
-    movh      m0, [srcq]
-    movh      m1, [srcq+  srcstrideq]
-    movh      m2, [srcq+2*srcstrideq]
-    add     srcq, srcstrideq
+    mov  picregq, srcstrideq
+    neg  picregq
+    MOV       m0, [srcq+picregq]
+    MOV       m1, [srcq]
+    MOV       m2, [srcq+srcstrideq]
+    lea     srcq, [srcq+2*srcstrideq]
+    punpcklbw m0, m2

+%if %1 == 4
+.next2rows:
+    movd       m3, [srcq]
+    movd       m4, [srcq+srcstrideq]
+    punpcklbw  m1, m3
+    punpcklqdq m0, m1
+    punpcklbw  m2, m4
+    pmaddubsw  m0, m5
+    punpcklqdq m1, m2
+    pmaddubsw  m1, m6
+    lea     srcq, [srcq+2*srcstrideq]
+    paddsw     m1, m0
+    pmulhrsw   m1, m7
+    mova       m0, m2
+    packuswb   m1, m1
+    movd   [dstq], m1
+    mova       m2, m4
+    psrldq     m1, 4
+    movd [dstq+dststrideq], m1
+    mova       m1, m3
+    lea      dstq, [dstq+2*dststrideq]
+    sub   heightd, 2
+    jg .next2rows
+%else
 .nextrow:
-    movh      m3, [srcq+2*srcstrideq]      ; read new row
-    mova      m4, m0
+    movh      m3, [srcq]      ; read new row
+    pmaddubsw m0, m5
+    punpcklbw m1, m3
+    pmaddubsw m4, m1, m6
+    add     srcq, srcstrideq
+    paddsw    m4, m0
    mova      m0, m1
-    punpcklbw m4, m1
-    mova      m1, m2
-    punpcklbw m2, m3
-    pmaddubsw m4, m5
-    pmaddubsw m2, m6
-    paddsw    m4, m2
-    mova      m2, m3
    pmulhrsw  m4, m7
+    mova      m1, m2
    packuswb  m4, m4
+    mova      m2, m3
    movh  [dstq], m4

    ; go to next line
    add      dstq, dststrideq
-    add      srcq, srcstrideq
    dec   heightd                          ; next row
    jg .nextrow
+%endif
    RET

 cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    lea      myd, [myq*3]
 %if PIC
-    lea  picregq, [sixtap_filter_hb_m]
+    lea  picregq, [sixtap_filter_b_m]
 %endif
-    lea      myq, [sixtap_filter_hb+myq*8]
+    lea      myq, [sixtap_filter_b+myq*8]

    ; read 5 lines
-    sub     srcq, srcstrideq
-    sub     srcq, srcstrideq
-    movh      m0, [srcq]
-    movh      m1, [srcq+srcstrideq]
-    movh      m2, [srcq+srcstrideq*2]
+    mov  picregq, srcstrideq
+    neg  picregq
+    MOV       m0, [srcq+2*picregq]
+    MOV       m1, [srcq+picregq]
+    MOV       m2, [srcq]
+    MOV       m3, [srcq+srcstrideq]
+    MOV       m4, [srcq+2*srcstrideq]
    lea     srcq, [srcq+srcstrideq*2]
-    add     srcq, srcstrideq
-    movh      m3, [srcq]
-    movh      m4, [srcq+srcstrideq]
+    punpcklbw m0, m3
+    punpcklbw m1, m4
+%if %1 == 4
+    punpcklqdq m0, m1
+
+.next2rows:
+    movd       m5, [srcq+srcstrideq]
+    movd       m6, [srcq+2*srcstrideq]
+    pmaddubsw  m0, [myq-48]
+    punpcklbw  m2, m5
+    punpcklqdq m1, m2
+    pmaddubsw  m1, [myq-32]
+    punpcklbw  m3, m6
+    punpcklqdq m2, m3
+    paddw      m0, m1
+    pmaddubsw  m1, m2, [myq-16]
+    lea      srcq, [srcq+2*srcstrideq]
+    paddsw     m1, m0
+    mova       m0, m2
+    pmulhrsw   m1, [pw_256]
+    mova       m2, m4
+    packuswb   m1, m1
+    movd   [dstq], m1
+    mova       m4, m6
+    psrldq     m1, 4
+    movd [dstq+dststrideq], m1
+    lea      dstq, [dstq+2*dststrideq]
+    mova       m1, m3
+    mova       m3, m5
+    sub   heightd, 2
+    jg .next2rows
+%else

 .nextrow:
-    movh      m5, [srcq+2*srcstrideq]      ; read new row
-    mova      m6, m0
-    punpcklbw m6, m5
+    movh      m5, [srcq+srcstrideq]      ; read new row
+    pmaddubsw m0, [myq-48]
+    punpcklbw m2, m5
+    pmaddubsw m6, m1, [myq-32]
+    pmaddubsw m7, m2, [myq-16]
+    add     srcq, srcstrideq
+    paddw     m6, m0
    mova      m0, m1
-    punpcklbw m1, m2
-    mova      m7, m3
-    punpcklbw m7, m4
-    pmaddubsw m6, [myq-48]
-    pmaddubsw m1, [myq-32]
-    pmaddubsw m7, [myq-16]
-    paddsw    m6, m1
    paddsw    m6, m7
    mova      m1, m2
    mova      m2, m3
@@ -311,123 +406,16 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr

    ; go to next line
    add      dstq, dststrideq
-    add      srcq, srcstrideq
    dec   heightd                          ; next row
    jg .nextrow
+%endif
    RET
 %endmacro

-INIT_MMX ssse3
-FILTER_SSSE3 4
 INIT_XMM ssse3
+FILTER_SSSE3 4
 FILTER_SSSE3 8

-; 4x4 block, H-only 4-tap filter
-INIT_MMX mmxext
-cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
-    shl       mxd, 4
-%if PIC
-    lea   picregq, [fourtap_filter_hw_m]
-%endif
-    movq      mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
-    movq      mm5, [fourtap_filter_hw+mxq]
-    movq      mm7, [pw_64]
-    pxor      mm6, mm6
-
-.nextrow:
-    movq      mm1, [srcq-1]                ; (ABCDEFGH) load 8 horizontal pixels
-
-    ; first set of 2 pixels
-    movq      mm2, mm1                     ; byte ABCD..
-    punpcklbw mm1, mm6                     ; byte->word ABCD
-    pshufw    mm0, mm2, 9                  ; byte CDEF..
-    punpcklbw mm0, mm6                     ; byte->word CDEF
-    pshufw    mm3, mm1, 0x94               ; word ABBC
-    pshufw    mm1, mm0, 0x94               ; word CDDE
-    pmaddwd   mm3, mm4                     ; multiply 2px with F0/F1
-    movq      mm0, mm1                     ; backup for second set of pixels
-    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
-    paddd     mm3, mm1                     ; finish 1st 2px
-
-    ; second set of 2 pixels, use backup of above
-    punpckhbw mm2, mm6                     ; byte->word EFGH
-    pmaddwd   mm0, mm4                     ; multiply backed up 2px with F0/F1
-    pshufw    mm1, mm2, 0x94               ; word EFFG
-    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
-    paddd     mm0, mm1                     ; finish 2nd 2px
-
-    ; merge two sets of 2 pixels into one set of 4, round/clip/store
-    packssdw  mm3, mm0                     ; merge dword->word (4px)
-    paddsw    mm3, mm7                     ; rounding
-    psraw     mm3, 7
-    packuswb  mm3, mm6                     ; clip and word->bytes
-    movd   [dstq], mm3                     ; store
-
-    ; go to next line
-    add      dstq, dststrideq
-    add      srcq, srcstrideq
-    dec   heightd                          ; next row
-    jg .nextrow
-    RET
-
-; 4x4 block, H-only 6-tap filter
-INIT_MMX mmxext
-cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
-    lea       mxd, [mxq*3]
-%if PIC
-    lea   picregq, [sixtap_filter_hw_m]
-%endif
-    movq      mm4, [sixtap_filter_hw+mxq*8-48] ; set up 4tap filter in words
-    movq      mm5, [sixtap_filter_hw+mxq*8-32]
-    movq      mm6, [sixtap_filter_hw+mxq*8-16]
-    movq      mm7, [pw_64]
-    pxor      mm3, mm3
-
-.nextrow:
-    movq      mm1, [srcq-2]                ; (ABCDEFGH) load 8 horizontal pixels
-
-    ; first set of 2 pixels
-    movq      mm2, mm1                     ; byte ABCD..
-    punpcklbw mm1, mm3                     ; byte->word ABCD
-    pshufw    mm0, mm2, 0x9                ; byte CDEF..
-    punpckhbw mm2, mm3                     ; byte->word EFGH
-    punpcklbw mm0, mm3                     ; byte->word CDEF
-    pshufw    mm1, mm1, 0x94               ; word ABBC
-    pshufw    mm2, mm2, 0x94               ; word EFFG
-    pmaddwd   mm1, mm4                     ; multiply 2px with F0/F1
-    pshufw    mm3, mm0, 0x94               ; word CDDE
-    movq      mm0, mm3                     ; backup for second set of pixels
-    pmaddwd   mm3, mm5                     ; multiply 2px with F2/F3
-    paddd     mm1, mm3                     ; add to 1st 2px cache
-    movq      mm3, mm2                     ; backup for second set of pixels
-    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
-    paddd     mm1, mm2                     ; finish 1st 2px
-
-    ; second set of 2 pixels, use backup of above
-    movd      mm2, [srcq+3]                ; byte FGHI (prevent overreads)
-    pmaddwd   mm0, mm4                     ; multiply 1st backed up 2px with F0/F1
-    pmaddwd   mm3, mm5                     ; multiply 2nd backed up 2px with F2/F3
-    paddd     mm0, mm3                     ; add to 2nd 2px cache
-    pxor      mm3, mm3
-    punpcklbw mm2, mm3                     ; byte->word FGHI
-    pshufw    mm2, mm2, 0xE9               ; word GHHI
-    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
-    paddd     mm0, mm2                     ; finish 2nd 2px
-
-    ; merge two sets of 2 pixels into one set of 4, round/clip/store
-    packssdw  mm1, mm0                     ; merge dword->word (4px)
-    paddsw    mm1, mm7                     ; rounding
-    psraw     mm1, 7
-    packuswb  mm1, mm3                     ; clip and word->bytes
-    movd   [dstq], mm1                     ; store
-
-    ; go to next line
-    add      dstq, dststrideq
-    add      srcq, srcstrideq
-    dec   heightd                          ; next row
-    jg .nextrow
-    RET
-
 INIT_XMM sse2
 cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 5
@@ -461,17 +449,17 @@ cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, h
    pmullw    m2, [mxq+32]
    pmullw    m3, [mxq+48]
 %endif
-    paddsw    m0, m1
-    paddsw    m2, m3
+    add     srcq, srcstrideq
+    paddw     m0, m1
+    paddw     m2, m3
+    paddw     m0, m4
    paddsw    m0, m2
-    paddsw    m0, m4
    psraw     m0, 7
    packuswb  m0, m7
    movh  [dstq], m0        ; store

    ; go to next line
    add     dstq, dststrideq
-    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
    RET
@@ -522,26 +510,26 @@ cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, h
    pmullw    m4, [mxq+64]
    pmullw    m5, [mxq+80]
 %endif
-    paddsw    m1, m4
-    paddsw    m0, m5
-    paddsw    m1, m2
-    paddsw    m0, m3
+    add     srcq, srcstrideq
+    paddw     m1, m4
+    paddw     m0, m5
+    paddw     m1, m2
+    paddw     m0, m3
+    paddw     m1, m6
    paddsw    m0, m1
-    paddsw    m0, m6
    psraw     m0, 7
    packuswb  m0, m7
    movh  [dstq], m0        ; store

    ; go to next line
    add     dstq, dststrideq
-    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
    RET

-%macro FILTER_V 1
+INIT_XMM sse2
 ; 4x4 block, V-only 4-tap filter
-cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
+cglobal put_vp8_epel8_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 5
 %if PIC
    lea  picregq, [fourtap_filter_v_m]
@@ -568,33 +556,33 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr
    mova      m3, m4
    pmullw    m0, [myq+0]
    pmullw    m4, m5
-    paddsw    m4, m0
+    paddw     m4, m0

    ; then calculate positive taps
    mova      m0, m1
    pmullw    m1, [myq+16]
-    paddsw    m4, m1
+    paddw     m4, m1
    mova      m1, m2
    pmullw    m2, [myq+32]
+    paddw     m4, m6
+    add     srcq, srcstrideq
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
-    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh  [dstq], m4

    ; go to next line
    add     dstq, dststrideq
-    add     srcq, srcstrideq
    dec  heightd                           ; next row
    jg .nextrow
    RET


 ; 4x4 block, V-only 6-tap filter
-cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
+cglobal put_vp8_epel8_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
    lea      myq, [myq*3]
 %if PIC
@@ -604,15 +592,14 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr
    pxor      m7, m7

    ; read 5 lines
-    sub     srcq, srcstrideq
-    sub     srcq, srcstrideq
-    movh      m0, [srcq]
-    movh      m1, [srcq+srcstrideq]
-    movh      m2, [srcq+srcstrideq*2]
+    mov  picregq, srcstrideq
+    neg  picregq
+    movh      m0, [srcq+2*picregq]
+    movh      m1, [srcq+picregq]
+    movh      m2, [srcq]
+    movh      m3, [srcq+srcstrideq]
+    movh      m4, [srcq+2*srcstrideq]
    lea     srcq, [srcq+srcstrideq*2]
-    add     srcq, srcstrideq
-    movh      m3, [srcq]
-    movh      m4, [srcq+srcstrideq]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
@@ -625,19 +612,21 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr
    pmullw    m5, [myq+16]
    mova      m6, m4
    pmullw    m6, [myq+64]
-    paddsw    m6, m5
+    paddw     m6, m5

    ; then calculate positive taps
-    movh      m5, [srcq+2*srcstrideq]      ; read new row
+    movh      m5, [srcq+srcstrideq]      ; read new row
    punpcklbw m5, m7
    pmullw    m0, [myq+0]
-    paddsw    m6, m0
+    paddw     m6, [pw_64]
+    paddw     m6, m0
    mova      m0, m1
    mova      m1, m2
    pmullw    m2, [myq+32]
-    paddsw    m6, m2
+    paddw     m6, m2
    mova      m2, m3
    pmullw    m3, [myq+48]
+    add     srcq, srcstrideq
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
@@ -645,23 +634,15 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr
    paddsw    m6, m5

    ; round/clip/store
-    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh  [dstq], m6

    ; go to next line
    add     dstq, dststrideq
-    add     srcq, srcstrideq
    dec  heightd                           ; next row
    jg .nextrow
    RET
-%endmacro
-
-INIT_MMX mmxext
-FILTER_V 4
-INIT_XMM sse2
-FILTER_V 8

 %macro FILTER_BILINEAR 1
 %if cpuflag(ssse3)
@@ -672,14 +653,15 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, p
 %endif
    pxor      m4, m4
    mova      m3, [bilinear_filter_vb+myq-16]
-.nextrow:
    movh      m0, [srcq+srcstrideq*0]
+.nextrow:
    movh      m1, [srcq+srcstrideq*1]
    movh      m2, [srcq+srcstrideq*2]
    punpcklbw m0, m1
    punpcklbw m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
+    lea     srcq, [srcq+srcstrideq*2]
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
@@ -694,6 +676,7 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, p
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
 %endif
+    mova      m0, m2
 %else ; cpuflag(ssse3)
 cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
@@ -716,26 +699,19 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, p
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
-    paddsw    m0, m1
-    paddsw    m2, m3
+    lea     srcq, [srcq+srcstrideq*2]
+    paddw     m0, m1
+    paddw     m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
-%if mmsize == 8
-    packuswb  m0, m0
-    packuswb  m2, m2
-    movh   [dstq+dststrideq*0], m0
-    movh   [dstq+dststrideq*1], m2
-%else
    packuswb  m0, m2
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
-%endif
 %endif ; cpuflag(ssse3)

    lea     dstq, [dstq+dststrideq*2]
-    lea     srcq, [srcq+srcstrideq*2]
    sub  heightd, 2
    jg .nextrow
    RET
@@ -756,6 +732,7 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride
    pshufb    m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
+    lea     srcq, [srcq+srcstrideq*2]
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
@@ -793,33 +770,24 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
-    paddsw    m0, m1
-    paddsw    m2, m3
+    lea     srcq, [srcq+srcstrideq*2]
+    paddw     m0, m1
+    paddw     m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
-%if mmsize == 8
-    packuswb  m0, m0
-    packuswb  m2, m2
-    movh   [dstq+dststrideq*0], m0
-    movh   [dstq+dststrideq*1], m2
-%else
    packuswb  m0, m2
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
-%endif
 %endif ; cpuflag(ssse3)

    lea     dstq, [dstq+dststrideq*2]
-    lea     srcq, [srcq+srcstrideq*2]
    sub  heightd, 2
    jg .nextrow
    RET
 %endmacro

-INIT_MMX mmxext
-FILTER_BILINEAR 4
 INIT_XMM sse2
 FILTER_BILINEAR 8
 INIT_MMX ssse3
@@ -827,14 +795,22 @@ FILTER_BILINEAR 4
 INIT_XMM ssse3
 FILTER_BILINEAR 8

-INIT_MMX mmx
-cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
+INIT_XMM sse2
+cglobal put_vp8_pixels8, 5, 5+2*ARCH_X86_64, 2, dst, dststride, src, srcstride, height
 .nextrow:
-    movq    mm0, [srcq+srcstrideq*0]
-    movq    mm1, [srcq+srcstrideq*1]
+%if ARCH_X86_64
+    mov     r5q, [srcq+srcstrideq*0]
+    mov     r6q, [srcq+srcstrideq*1]
    lea    srcq, [srcq+srcstrideq*2]
-    movq [dstq+dststrideq*0], mm0
-    movq [dstq+dststrideq*1], mm1
+    mov [dstq+dststrideq*0], r5q
+    mov [dstq+dststrideq*1], r6q
+%else
+    movq     m0, [srcq+srcstrideq*0]
+    movq     m1, [srcq+srcstrideq*1]
+    lea    srcq, [srcq+srcstrideq*2]
+    movq [dstq+dststrideq*0], m0
+    movq [dstq+dststrideq*1], m1
+%endif
    lea    dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
--- a/libavcodec/x86/vp8dsp_init.c
+++ b/libavcodec/x86/vp8dsp_init.c
@@ -29,19 +29,6 @@
 /*
 * MC functions
 */
-void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride,
-                                const uint8_t *src, ptrdiff_t srcstride,
-                                int height, int mx, int my);
-void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride,
-                                const uint8_t *src, ptrdiff_t srcstride,
-                                int height, int mx, int my);
-void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride,
-                                const uint8_t *src, ptrdiff_t srcstride,
-                                int height, int mx, int my);
-void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride,
-                                const uint8_t *src, ptrdiff_t srcstride,
-                                int height, int mx, int my);
-
 void ff_put_vp8_epel8_h4_sse2  (uint8_t *dst, ptrdiff_t dststride,
                                const uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);
@@ -80,9 +67,6 @@ void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                const uint8_t *src, ptrdiff_t srcstride,
                                int height, int mx, int my);

-void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride,
-                                   const uint8_t *src, ptrdiff_t srcstride,
-                                   int height, int mx, int my);
 void ff_put_vp8_bilinear8_h_sse2  (uint8_t *dst, ptrdiff_t dststride,
                                   const uint8_t *src, ptrdiff_t srcstride,
                                   int height, int mx, int my);
@@ -93,9 +77,6 @@ void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                   const uint8_t *src, ptrdiff_t srcstride,
                                   int height, int mx, int my);

-void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride,
-                                   const uint8_t *src, ptrdiff_t srcstride,
-                                   int height, int mx, int my);
 void ff_put_vp8_bilinear8_v_sse2  (uint8_t *dst, ptrdiff_t dststride,
                                   const uint8_t *src, ptrdiff_t srcstride,
                                   int height, int mx, int my);
@@ -107,7 +88,7 @@ void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                   int height, int mx, int my);


-void ff_put_vp8_pixels8_mmx (uint8_t *dst, ptrdiff_t dststride,
+void ff_put_vp8_pixels8_sse2(uint8_t *dst, ptrdiff_t dststride,
                             const uint8_t *src, ptrdiff_t srcstride,
                             int height, int mx, int my);
 void ff_put_vp8_pixels16_sse(uint8_t *dst, ptrdiff_t dststride,
@@ -124,16 +105,6 @@ static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \
    ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
        dst + 8, dststride, src + 8, srcstride, height, mx, my); \
 }
-#define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \
-static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
-    uint8_t *dst,  ptrdiff_t dststride, uint8_t *src, \
-    ptrdiff_t srcstride, int height, int mx, int my) \
-{ \
-    ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
-        dst,     dststride, src,     srcstride, height, mx, my); \
-    ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
-        dst + 4, dststride, src + 4, srcstride, height, mx, my); \
-}

 TAP_W16(sse2,  epel, h6)
 TAP_W16(sse2,  epel, v6)
@@ -159,14 +130,6 @@ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT
        dst, dststride, tmpptr, SIZE,      height,               mx, my); \
 }

-#define HVTAPMMX(x, y) \
-HVTAP(mmxext, 8, x, y,  4,  8)
-
-HVTAPMMX(4, 4)
-HVTAPMMX(4, 6)
-HVTAPMMX(6, 4)
-HVTAPMMX(6, 6)
-
 #define HVTAPSSE2(x, y, w) \
 HVTAP(sse2,  16, x, y, w, 16) \
 HVTAP(ssse3, 16, x, y, w, 16)
@@ -194,7 +157,6 @@ static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
        dst, dststride, tmp, SIZE,      height,     mx, my); \
 }

-HVBILIN(mmxext,  8,  4,  8)
 HVBILIN(sse2,  8,  8, 16)
 HVBILIN(sse2,  8, 16, 16)
 HVBILIN(ssse3, 8,  4,  8)
@@ -280,30 +242,22 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
 {
    int cpu_flags = av_get_cpu_flags();

-    if (EXTERNAL_MMX(cpu_flags)) {
-        c->put_vp8_epel_pixels_tab[1][0][0]     =
-        c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
-    }
-
-    /* note that 4-tap width=16 functions are missing because w=16
-     * is only used for luma, and luma is always a copy or sixtap. */
-    if (EXTERNAL_MMXEXT(cpu_flags)) {
-        VP8_MC_FUNC(2, 4, mmxext);
-        VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
-    }
-
    if (EXTERNAL_SSE(cpu_flags)) {
        c->put_vp8_epel_pixels_tab[0][0][0]     =
        c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
    }

    if (EXTERNAL_SSE2_SLOW(cpu_flags)) {
+        c->put_vp8_epel_pixels_tab[1][0][0]     =
+        c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_sse2;
        VP8_LUMA_MC_FUNC(0, 16, sse2);
        VP8_MC_FUNC(1, 8, sse2);
        VP8_BILINEAR_MC_FUNC(0, 16, sse2);
        VP8_BILINEAR_MC_FUNC(1, 8, sse2);
    }

+    /* note that 4-tap width=16 functions are missing because w=16
+     * is only used for luma, and luma is always a copy or sixtap. */
    if (EXTERNAL_SSSE3(cpu_flags)) {
        VP8_LUMA_MC_FUNC(0, 16, ssse3);
        VP8_MC_FUNC(1, 8, ssse3);
--- a/libavfilter/f_sidedata.c
+++ b/libavfilter/f_sidedata.c
@@ -27,10 +27,8 @@
 #include "libavutil/internal.h"
 #include "libavutil/frame.h"
 #include "libavutil/opt.h"
-#include "audio.h"
 #include "avfilter.h"
 #include "filters.h"
-#include "video.h"

 enum SideDataMode {
    SIDEDATA_SELECT,
@@ -96,6 +94,31 @@ static av_cold int init(AVFilterContext *ctx)
    return 0;
 }

+static int config_props(AVFilterLink *outlink)
+{
+    AVFilterContext *ctx = outlink->src;
+    SideDataContext *s = ctx->priv;
+    const AVFrameSideData *sd = NULL;
+
+    if (s->type != -1)
+       sd = av_frame_side_data_get(outlink->side_data, outlink->nb_side_data, s->type);
+
+    switch (s->mode) {
+    case SIDEDATA_SELECT:
+        break;
+    case SIDEDATA_DELETE:
+        if (s->type == -1)
+            av_frame_side_data_free(&outlink->side_data, &outlink->nb_side_data);
+        else if (sd)
+            av_frame_side_data_remove(&outlink->side_data, &outlink->nb_side_data, s->type);
+        break;
+    default:
+        av_assert0(0);
+    };
+
+    return 0;
+}
+
 static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
 {
    AVFilterContext *ctx = inlink->dst;
@@ -143,6 +166,14 @@ static const AVFilterPad ainputs[] = {
    },
 };

+static const AVFilterPad aoutputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_AUDIO,
+        .config_props = config_props,
+    },
+};
+
 const FFFilter ff_af_asidedata = {
    .p.name        = "asidedata",
    .p.description = NULL_IF_CONFIG_SMALL("Manipulate audio frame side data."),
@@ -152,7 +183,7 @@ const FFFilter ff_af_asidedata = {
    .priv_size     = sizeof(SideDataContext),
    .init          = init,
    FILTER_INPUTS(ainputs),
-    FILTER_OUTPUTS(ff_audio_default_filterpad),
+    FILTER_OUTPUTS(aoutputs),
 };
 #endif /* CONFIG_ASIDEDATA_FILTER */

@@ -169,6 +200,14 @@ static const AVFilterPad inputs[] = {
    },
 };

+static const AVFilterPad outputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .config_props = config_props,
+    },
+};
+
 const FFFilter ff_vf_sidedata = {
    .p.name        = "sidedata",
    .p.description = NULL_IF_CONFIG_SMALL("Manipulate video frame side data."),
@@ -178,6 +217,6 @@ const FFFilter ff_vf_sidedata = {
    .priv_size   = sizeof(SideDataContext),
    .init        = init,
    FILTER_INPUTS(inputs),
-    FILTER_OUTPUTS(ff_video_default_filterpad),
+    FILTER_OUTPUTS(outputs),
 };
 #endif /* CONFIG_SIDEDATA_FILTER */
--- a/libavformat/movenc.c
+++ b/libavformat/movenc.c
@@ -7867,8 +7867,11 @@ static int mov_init_iamf_track(AVFormatContext *s)
        default:
            av_assert0(0);
        }
-        if (ret < 0)
+        if (ret < 0) {
+            ff_iamf_uninit_context(iamf);
+            av_free(iamf);
            return ret;
+        }
    }

    track = &mov->tracks[first_iamf_idx];
--- a/tests/checkasm/vp8dsp.c
+++ b/tests/checkasm/vp8dsp.c
@@ -510,7 +510,8 @@ static void checkasm_check_vp78dsp(VP8DSPContext *d, bool is_vp7)

 void checkasm_check_vp8dsp(void)
 {
-    VP8DSPContext d;
+    // Needs to be zeroed because not all size 16 epel functions exist.
+    VP8DSPContext d = { 0 };

    ff_vp78dsp_init(&d);
    check_mc(&d);
Author	SHA1	Message	Date
James Almer	52c84b06d5	avfilter/f_sidedata: also handle global side data in filter links Should fix issue #21071 Signed-off-by: James Almer <jamrial@gmail.com>	2025-12-04 13:50:45 -03:00
Andreas Rheinhardt	e0845ec2cf	avformat/movenc: Fix leak of IAMFContext on error Forgotten in `5b87869c09`. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-12-04 16:15:09 +00:00
Lynne	f80addbb07	ffv1enc_vulkan: fix encoding with large contexts When RGB_LINECACHE == 2, then top2 is not the current line.	2025-12-04 16:53:58 +01:00
Andreas Rheinhardt	4b6e40a298	avcodec/vp8dsp: Don't compile unused functions The width 16 epel functions never use four taps in any direction, so don't build said functions. Saves 4352B of .text and 89B of .text.unlikely here. Note: mx and my in vp8_mc_luma() are always even. Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-12-04 15:17:37 +01:00
Andreas Rheinhardt	9cff236e2f	avcodec/riscv/vp8dsp_rvv: Remove unused functions Only the sixtap functions are used for size 16. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-12-04 15:17:37 +01:00
Andreas Rheinhardt	050c80a526	avcodec/x86/vp8dsp: Don't use saturated addition when unnecessary For the epel functions, there can be no overflow as long as the sum contains only one of the two large central coefficients; for bilinear functions, there can be no overflow whatsoever. Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-12-04 15:17:37 +01:00
Andreas Rheinhardt	575e9e9c08	avcodec/x86/vp8dsp: Reduce number of coefficient tables By changing the permutations used in the epel8_h{4,6} case we can simply reuse the coefficient tables from the vertical epel filters. Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-12-04 15:17:37 +01:00
Andreas Rheinhardt	99fb257f58	avcodec/x86/vp8dsp: Don't use MMX registers in ff_put_vp8_epel4_h6_ssse3 Doubling the register width allowed to avoid a pshufb and a pmaddubsw. Old benchmarks: vp8_put_epel4_h6_c: 115.9 ( 1.00x) vp8_put_epel4_h6_ssse3: 20.2 ( 5.74x) vp8_put_epel4_h6v4_c: 276.3 ( 1.00x) vp8_put_epel4_h6v4_ssse3: 58.6 ( 4.71x) vp8_put_epel4_h6v6_c: 363.6 ( 1.00x) vp8_put_epel4_h6v6_ssse3: 62.5 ( 5.82x) New benchmarks: vp8_put_epel4_h6_c: 116.4 ( 1.00x) vp8_put_epel4_h6_ssse3: 16.0 ( 7.29x) vp8_put_epel4_h6v4_c: 280.9 ( 1.00x) vp8_put_epel4_h6v4_ssse3: 44.3 ( 6.33x) vp8_put_epel4_h6v6_c: 365.6 ( 1.00x) vp8_put_epel4_h6v6_ssse3: 53.1 ( 6.89x) Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-12-04 15:17:37 +01:00
Andreas Rheinhardt	3135bc0d3a	avcodec/x86/vp8dsp: Don't use MMX registers in ff_put_vp8_epel4_h4_ssse3 Doubling the register width allows to use only one pshufb and pmaddubsw. Old benchmarks: vp8_put_epel4_h4_c: 82.8 ( 1.00x) vp8_put_epel4_h4_ssse3: 13.9 ( 5.96x) New benchmarks: vp8_put_epel4_h4_c: 82.7 ( 1.00x) vp8_put_epel4_h4_ssse3: 11.7 ( 7.08x) Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-12-04 15:17:37 +01:00
Andreas Rheinhardt	714cbf1c70	avcodec/x86/vp8dsp: Don't use MMX registers in ff_put_vp8_epel4_v4_ssse3 Switching to xmm registers allows to process two rows in parallel, leading to speedups. It is also ABI compliant (no more missing emms). Old benchmarks: vp8_put_epel4_v4_c: 96.8 ( 1.00x) vp8_put_epel4_v4_ssse3: 28.2 ( 3.43x) New benchmarks: vp8_put_epel4_v4_c: 95.1 ( 1.00x) vp8_put_epel4_v4_ssse3: 22.8 ( 4.17x) Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-12-04 15:17:37 +01:00
Andreas Rheinhardt	f017806829	avcodec/x86/vp8dsp: Don't use MMX registers in ff_put_vp8_epel4_v6_ssse3 Switching to xmm registers allows to process two rows in parallel, leading to speedups. It is also ABI compliant (no more missing emms). Old benchmarks: vp8_put_epel4_v6_c: 132.8 ( 1.00x) vp8_put_epel4_v6_ssse3: 34.3 ( 3.87x) New benchmarks: vp8_put_epel4_v6_c: 131.5 ( 1.00x) vp8_put_epel4_v6_ssse3: 27.1 ( 4.86x) Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-12-04 15:17:37 +01:00
Andreas Rheinhardt	7411998757	avcodec/x86/vp8dsp: Avoid unpacking multiple times Always pair row i with row i+2 for the vertical four-tap filter and row i+3 for the vertical six-tap filter (instead of pairing the first with the sixth, the second with the third and the fourth and the fifth). This allows to unpack each row only once instead of (at most) three times. Old benchmarks: vp8_put_epel4_v4_c: 98.4 ( 1.00x) vp8_put_epel4_v4_ssse3: 28.6 ( 3.44x) vp8_put_epel4_v6_c: 131.6 ( 1.00x) vp8_put_epel4_v6_ssse3: 38.5 ( 3.42x) vp8_put_epel8_v4_c: 362.5 ( 1.00x) vp8_put_epel8_v4_sse2: 63.8 ( 5.68x) vp8_put_epel8_v4_ssse3: 44.4 ( 8.16x) vp8_put_epel8_v6_c: 538.3 ( 1.00x) vp8_put_epel8_v6_sse2: 86.5 ( 6.22x) vp8_put_epel8_v6_ssse3: 57.0 ( 9.44x) vp8_put_epel16_v6_c: 1044.6 ( 1.00x) vp8_put_epel16_v6_sse2: 158.0 ( 6.61x) vp8_put_epel16_v6_ssse3: 106.7 ( 9.79x) New benchmarks: vp8_put_epel4_v4_c: 100.0 ( 1.00x) vp8_put_epel4_v4_ssse3: 28.4 ( 3.52x) vp8_put_epel4_v6_c: 131.7 ( 1.00x) vp8_put_epel4_v6_ssse3: 34.3 ( 3.84x) vp8_put_epel8_v4_c: 364.4 ( 1.00x) vp8_put_epel8_v4_sse2: 63.7 ( 5.72x) vp8_put_epel8_v4_ssse3: 43.3 ( 8.42x) vp8_put_epel8_v6_c: 550.2 ( 1.00x) vp8_put_epel8_v6_sse2: 86.4 ( 6.37x) vp8_put_epel8_v6_ssse3: 52.9 (10.40x) vp8_put_epel16_v6_c: 1052.5 ( 1.00x) vp8_put_epel16_v6_sse2: 158.3 ( 6.65x) vp8_put_epel16_v6_ssse3: 98.9 (10.64x) Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-12-04 15:17:37 +01:00
Andreas Rheinhardt	24cdd4100d	avcodec/x86/vp8dsp_init: Remove unused macro Forgotten in `6a551f1405`. Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-12-04 15:17:37 +01:00
Andreas Rheinhardt	76900089fb	avcodec/x86/vp8dsp: Avoid reload Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-12-04 15:17:37 +01:00
Andreas Rheinhardt	86aa1b81ec	avcodec/x86/vp8dsp: Increment src pointer earlier Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-12-04 15:17:37 +01:00
Andreas Rheinhardt	e59ed3470d	avcodec/x86/vp8dsp: Directly use negated stride There is a register available. No change in benchmarks here. Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-12-04 15:17:37 +01:00
Andreas Rheinhardt	8fb6b0c733	avcodec/x86/vp8dsp: Don't use MMX registers in put_vp8_pixels8 Use GPRs on x64 and xmm registers else (using GPRs reduces codesize). This avoids clobbering the floating point state and therefore no longer breaks the ABI. No change in benchmarks here. Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-12-04 15:17:36 +01:00
Andreas Rheinhardt	ed5e0f9c68	avcodec/x86/vp8dsp: Remove MMXEXT functions overridden by SSSE3 SSSE3 is already quite old (introduced 2006 for Intel, 2011 for AMD), so that the overwhelming majority of our users (particularly those that actually update their FFmpeg) will be using the SSSE3 versions. This commit therefore removes the MMX(EXT) functions overridden by them (which don't abide by the ABI) to get closer to a removal of emms_c. Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>	2025-12-04 15:17:36 +01:00
Lynne	9b14ea0aa1	vulkan_dpx: fix alignment issue 12-bit images apparently require mod-32 alignment for each line. Go figure.	2025-12-04 15:08:46 +01:00