mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2025-12-05 14:30:00 +01:00
Compare commits
19 Commits
d6458f6a8b
...
52c84b06d5
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
52c84b06d5 | ||
|
|
e0845ec2cf | ||
|
|
f80addbb07 | ||
|
|
4b6e40a298 | ||
|
|
9cff236e2f | ||
|
|
050c80a526 | ||
|
|
575e9e9c08 | ||
|
|
99fb257f58 | ||
|
|
3135bc0d3a | ||
|
|
714cbf1c70 | ||
|
|
f017806829 | ||
|
|
7411998757 | ||
|
|
24cdd4100d | ||
|
|
76900089fb | ||
|
|
86aa1b81ec | ||
|
|
e59ed3470d | ||
|
|
8fb6b0c733 | ||
|
|
ed5e0f9c68 | ||
|
|
9b14ea0aa1 |
@@ -90,27 +90,22 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
|
||||
c->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_rvv;
|
||||
c->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_rvv;
|
||||
c->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_rvv;
|
||||
c->put_vp8_epel_pixels_tab[0][0][1] = ff_put_vp8_epel16_h4_rvv;
|
||||
c->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_rvv;
|
||||
c->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_rvv;
|
||||
|
||||
c->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_rvv;
|
||||
c->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_rvv;
|
||||
c->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_rvv;
|
||||
c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv;
|
||||
c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv;
|
||||
c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv;
|
||||
#if __riscv_xlen <= 64
|
||||
c->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_rvv;
|
||||
c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_rvv;
|
||||
c->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_rvv;
|
||||
c->put_vp8_epel_pixels_tab[0][2][1] = ff_put_vp8_epel16_h4v6_rvv;
|
||||
c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_rvv;
|
||||
c->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_rvv;
|
||||
c->put_vp8_epel_pixels_tab[0][1][1] = ff_put_vp8_epel16_h4v4_rvv;
|
||||
c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_rvv;
|
||||
c->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_rvv;
|
||||
c->put_vp8_epel_pixels_tab[0][1][2] = ff_put_vp8_epel16_h6v4_rvv;
|
||||
c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_rvv;
|
||||
c->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_rvv;
|
||||
#endif
|
||||
|
||||
@@ -537,7 +537,14 @@ func ff_put_vp8_epel\len\()_h\hsize\()v\vsize\()_rvv, zve32x, zba
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
.irp len,16,8,4
|
||||
# Only the sixtaps versions are used for epel16.
|
||||
epel 16 6 h
|
||||
epel 16 6 v
|
||||
#if __riscv_xlen <= 64
|
||||
epel_hv 16 6 6
|
||||
#endif
|
||||
|
||||
.irp len,8,4
|
||||
epel \len 6 h
|
||||
epel \len 4 h
|
||||
epel \len 6 v
|
||||
|
||||
@@ -558,26 +558,21 @@ put_vp8_epel ## SIZE ## _h ## HTAPS ## v ## VTAPS ## _c(uint8_t *dst, \
|
||||
} \
|
||||
}
|
||||
|
||||
VP8_EPEL_H(16, 4)
|
||||
VP8_EPEL_H(8, 4)
|
||||
VP8_EPEL_H(4, 4)
|
||||
VP8_EPEL_H(16, 6)
|
||||
VP8_EPEL_H(8, 6)
|
||||
VP8_EPEL_H(4, 6)
|
||||
VP8_EPEL_V(16, 4)
|
||||
VP8_EPEL_V(8, 4)
|
||||
VP8_EPEL_V(4, 4)
|
||||
VP8_EPEL_V(16, 6)
|
||||
VP8_EPEL_V(8, 6)
|
||||
VP8_EPEL_V(4, 6)
|
||||
|
||||
VP8_EPEL_HV(16, 4, 4)
|
||||
VP8_EPEL_HV(8, 4, 4)
|
||||
VP8_EPEL_HV(4, 4, 4)
|
||||
VP8_EPEL_HV(16, 4, 6)
|
||||
VP8_EPEL_HV(8, 4, 6)
|
||||
VP8_EPEL_HV(4, 4, 6)
|
||||
VP8_EPEL_HV(16, 6, 4)
|
||||
VP8_EPEL_HV(8, 6, 4)
|
||||
VP8_EPEL_HV(4, 6, 4)
|
||||
VP8_EPEL_HV(16, 6, 6)
|
||||
@@ -667,7 +662,11 @@ VP8_BILINEAR(4)
|
||||
|
||||
av_cold void ff_vp78dsp_init(VP8DSPContext *dsp)
|
||||
{
|
||||
VP78_MC_FUNC(0, 16);
|
||||
dsp->put_vp8_epel_pixels_tab[0][0][0] = put_vp8_pixels16_c;
|
||||
dsp->put_vp8_epel_pixels_tab[0][0][2] = put_vp8_epel16_h6_c;
|
||||
dsp->put_vp8_epel_pixels_tab[0][2][0] = put_vp8_epel16_v6_c;
|
||||
dsp->put_vp8_epel_pixels_tab[0][2][2] = put_vp8_epel16_h6v6_c;
|
||||
|
||||
VP78_MC_FUNC(1, 8);
|
||||
VP78_MC_FUNC(2, 4);
|
||||
|
||||
|
||||
@@ -44,8 +44,11 @@ i16vec4 parse_packed_in_32(ivec2 pos, int stride)
|
||||
#else
|
||||
i16vec4 parse_packed_in_32(ivec2 pos, int stride)
|
||||
{
|
||||
uint line_off = pos.y*(stride*BITS_PER_COMP*COMPONENTS +
|
||||
(need_align << 3));
|
||||
uint line_size = stride*BITS_PER_COMP*COMPONENTS;
|
||||
line_size += line_size & 31;
|
||||
line_size += need_align << 3;
|
||||
|
||||
uint line_off = pos.y*line_size;
|
||||
uint pix_off = pos.x*BITS_PER_COMP*COMPONENTS;
|
||||
|
||||
uint off = (line_off + pix_off >> 5);
|
||||
|
||||
@@ -124,8 +124,12 @@ ivec2 get_pred(readonly uimage2D pred, ivec2 sp, ivec2 off,
|
||||
}
|
||||
base += quant_table[quant_table_idx][3][(cur2 - cur) & MAX_QUANT_TABLE_MASK];
|
||||
|
||||
#if RGB_LINECACHE == 2
|
||||
/* top-2 became current upon swap */
|
||||
TYPE top2 = TYPE(imageLoad(pred, sp + LADDR(off))[comp]);
|
||||
#else
|
||||
TYPE top2 = TYPE(imageLoad(pred, sp + LADDR(off + ivec2(0, -2)))[comp]);
|
||||
#endif
|
||||
base += quant_table[quant_table_idx][4][(top2 - top[1]) & MAX_QUANT_TABLE_MASK];
|
||||
}
|
||||
|
||||
|
||||
@@ -402,9 +402,7 @@ static int vk_decode_dpx_init(AVCodecContext *avctx)
|
||||
|
||||
switch (dpx->pix_fmt) {
|
||||
case AV_PIX_FMT_GRAY10:
|
||||
case AV_PIX_FMT_GRAY12:
|
||||
case AV_PIX_FMT_GBRAP10:
|
||||
case AV_PIX_FMT_GBRAP12:
|
||||
case AV_PIX_FMT_UYVY422:
|
||||
case AV_PIX_FMT_YUV444P:
|
||||
case AV_PIX_FMT_YUVA444P:
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
;******************************************************************************
|
||||
;* VP8 MMXEXT optimizations
|
||||
;* VP8 ASM optimizations
|
||||
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
|
||||
;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
|
||||
;*
|
||||
@@ -24,43 +24,43 @@
|
||||
|
||||
SECTION_RODATA
|
||||
|
||||
fourtap_filter_hw_m: times 4 dw -6, 123
|
||||
times 4 dw 12, -1
|
||||
times 4 dw -9, 93
|
||||
times 4 dw 50, -6
|
||||
times 4 dw -6, 50
|
||||
times 4 dw 93, -9
|
||||
times 4 dw -1, 12
|
||||
times 4 dw 123, -6
|
||||
fourtap_filter4_b_m: times 4 db -6, 123
|
||||
times 4 db 12, -1
|
||||
times 4 db -9, 93
|
||||
times 4 db 50, -6
|
||||
times 4 db -6, 50
|
||||
times 4 db 93, -9
|
||||
times 4 db -1, 12
|
||||
times 4 db 123, -6
|
||||
|
||||
sixtap_filter_hw_m: times 4 dw 2, -11
|
||||
times 4 dw 108, 36
|
||||
times 4 dw -8, 1
|
||||
times 4 dw 3, -16
|
||||
times 4 dw 77, 77
|
||||
times 4 dw -16, 3
|
||||
times 4 dw 1, -8
|
||||
times 4 dw 36, 108
|
||||
times 4 dw -11, 2
|
||||
sixtap_filter4_hb_m: times 8 db 2, -11
|
||||
times 4 db 108, -8
|
||||
times 4 db 36, 1
|
||||
times 8 db 3, -16
|
||||
times 4 db 77, -16
|
||||
times 4 db 77, 3
|
||||
times 8 db 1, -8
|
||||
times 4 db 36, -11
|
||||
times 4 db 108, 2
|
||||
|
||||
fourtap_filter_hb_m: times 8 db -6, 123
|
||||
times 8 db 12, -1
|
||||
times 8 db -9, 93
|
||||
times 8 db 50, -6
|
||||
times 8 db -6, 50
|
||||
times 8 db 93, -9
|
||||
times 8 db -1, 12
|
||||
times 8 db 123, -6
|
||||
fourtap_filter_b_m: times 8 db -6, 12
|
||||
times 8 db 123, -1
|
||||
times 8 db -9, 50
|
||||
times 8 db 93, -6
|
||||
times 8 db -6, 93
|
||||
times 8 db 50, -9
|
||||
times 8 db -1, 123
|
||||
times 8 db 12, -6
|
||||
|
||||
sixtap_filter_hb_m: times 8 db 2, 1
|
||||
times 8 db -11, 108
|
||||
times 8 db 36, -8
|
||||
times 8 db 3, 3
|
||||
times 8 db -16, 77
|
||||
times 8 db 77, -16
|
||||
times 8 db 1, 2
|
||||
times 8 db -8, 36
|
||||
times 8 db 108, -11
|
||||
sixtap_filter_b_m: times 8 db 2, 36
|
||||
times 8 db -11, -8
|
||||
times 8 db 108, 1
|
||||
times 8 db 3, 77
|
||||
times 8 db -16, -16
|
||||
times 8 db 77, 3
|
||||
times 8 db 1, 108
|
||||
times 8 db -8, -11
|
||||
times 8 db 36, 2
|
||||
|
||||
fourtap_filter_v_m: times 8 dw -6
|
||||
times 8 dw 123
|
||||
@@ -115,20 +115,20 @@ bilinear_filter_vb_m: times 8 db 7, 1
|
||||
times 8 db 1, 7
|
||||
|
||||
%if PIC
|
||||
%define fourtap_filter_hw picregq
|
||||
%define sixtap_filter_hw picregq
|
||||
%define fourtap_filter_hb picregq
|
||||
%define sixtap_filter_hb picregq
|
||||
%define fourtap_filter_b picregq
|
||||
%define fourtap_filter4_b picregq
|
||||
%define sixtap_filter_b picregq
|
||||
%define sixtap_filter4_hb picregq
|
||||
%define fourtap_filter_v picregq
|
||||
%define sixtap_filter_v picregq
|
||||
%define bilinear_filter_vw picregq
|
||||
%define bilinear_filter_vb picregq
|
||||
%define npicregs 1
|
||||
%else
|
||||
%define fourtap_filter_hw fourtap_filter_hw_m
|
||||
%define sixtap_filter_hw sixtap_filter_hw_m
|
||||
%define fourtap_filter_hb fourtap_filter_hb_m
|
||||
%define sixtap_filter_hb sixtap_filter_hb_m
|
||||
%define fourtap_filter_b fourtap_filter_b_m
|
||||
%define fourtap_filter4_b fourtap_filter4_b_m
|
||||
%define sixtap_filter_b sixtap_filter_b_m
|
||||
%define sixtap_filter4_hb sixtap_filter4_hb_m
|
||||
%define fourtap_filter_v fourtap_filter_v_m
|
||||
%define sixtap_filter_v sixtap_filter_v_m
|
||||
%define bilinear_filter_vw bilinear_filter_vw_m
|
||||
@@ -136,12 +136,17 @@ bilinear_filter_vb_m: times 8 db 7, 1
|
||||
%define npicregs 0
|
||||
%endif
|
||||
|
||||
filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
|
||||
filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
|
||||
filter4_h4_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
|
||||
filter4_h6_shuf: db 1, 3, 2, 4, 3, 5, 4, 6, 2, 4, 3, 5, 4, 6, 5, 7
|
||||
|
||||
filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
|
||||
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
|
||||
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
|
||||
filter_h4_shuf1: db 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9
|
||||
filter_h4_shuf2: db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
|
||||
|
||||
filter_h6_shuf1: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10
|
||||
filter_h6_shuf2: db 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11
|
||||
filter_h6_shuf3: db 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11, 9, 12
|
||||
|
||||
filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
|
||||
|
||||
pw_20091: times 4 dw 20091
|
||||
pw_17734: times 4 dw 17734
|
||||
@@ -162,72 +167,109 @@ SECTION .text
|
||||
;-------------------------------------------------------------------------------
|
||||
|
||||
%macro FILTER_SSSE3 1
|
||||
cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
|
||||
%if %1 == 4
|
||||
%define MOV movd
|
||||
%else
|
||||
%define MOV movq
|
||||
%endif
|
||||
|
||||
cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 6+2*(%1==8), dst, dststride, src, srcstride, height, mx, picreg
|
||||
%if %1 == 4
|
||||
mova m3, [filter4_h6_shuf]
|
||||
%if PIC
|
||||
lea picregq, [sixtap_filter4_hb_m]
|
||||
%endif
|
||||
shl mxd, 4
|
||||
mova m4, [sixtap_filter4_hb+mxq-32]
|
||||
mova m5, [sixtap_filter4_hb+mxq-16]
|
||||
%else
|
||||
lea mxd, [mxq*3]
|
||||
mova m3, [filter_h6_shuf2]
|
||||
mova m4, [filter_h6_shuf3]
|
||||
%if PIC
|
||||
lea picregq, [sixtap_filter_hb_m]
|
||||
lea picregq, [sixtap_filter_b_m]
|
||||
%endif
|
||||
mova m5, [sixtap_filter_b+mxq*8-48] ; set up 6tap filter in bytes
|
||||
mova m6, [sixtap_filter_b+mxq*8-32]
|
||||
mova m7, [sixtap_filter_b+mxq*8-16]
|
||||
%endif
|
||||
mova m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
|
||||
mova m6, [sixtap_filter_hb+mxq*8-32]
|
||||
mova m7, [sixtap_filter_hb+mxq*8-16]
|
||||
|
||||
.nextrow:
|
||||
%if %1 == 4
|
||||
; we need nine bytes, so two loads
|
||||
movq m1, [srcq-1]
|
||||
movq m0, [srcq-2]
|
||||
punpcklbw m0, m1
|
||||
pshufb m1, m3
|
||||
pmaddubsw m1, m5
|
||||
pmaddubsw m0, m4
|
||||
movhlps m2, m1
|
||||
%else
|
||||
movu m0, [srcq-2]
|
||||
mova m1, m0
|
||||
mova m2, m0
|
||||
%if mmsize == 8
|
||||
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
|
||||
; shuffle with a memory operand
|
||||
punpcklbw m0, [srcq+3]
|
||||
%else
|
||||
pshufb m0, [filter_h6_shuf1]
|
||||
%endif
|
||||
pshufb m1, m3
|
||||
pshufb m2, m4
|
||||
pmaddubsw m0, m5
|
||||
pmaddubsw m1, m6
|
||||
pmaddubsw m2, m7
|
||||
paddsw m0, m1
|
||||
%endif
|
||||
add srcq, srcstrideq
|
||||
paddw m0, m1
|
||||
paddsw m0, m2
|
||||
pmulhrsw m0, [pw_256]
|
||||
packuswb m0, m0
|
||||
movh [dstq], m0 ; store
|
||||
MOV [dstq], m0 ; store
|
||||
|
||||
; go to next line
|
||||
add dstq, dststrideq
|
||||
add srcq, srcstrideq
|
||||
dec heightd ; next row
|
||||
jg .nextrow
|
||||
RET
|
||||
|
||||
cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
|
||||
shl mxd, 4
|
||||
cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 6+!!(%1 == 8), dst, dststride, src, srcstride, height, mx, picreg
|
||||
mova m2, [pw_256]
|
||||
mova m3, [filter_h2_shuf]
|
||||
mova m4, [filter_h4_shuf]
|
||||
%if %1 == 8
|
||||
shl mxd, 4
|
||||
mova m3, [filter_h4_shuf1]
|
||||
mova m4, [filter_h4_shuf2]
|
||||
%if PIC
|
||||
lea picregq, [fourtap_filter_hb_m]
|
||||
lea picregq, [fourtap_filter_b_m]
|
||||
%endif
|
||||
mova m5, [fourtap_filter_b+mxq-16] ; set up 4tap filter in bytes
|
||||
mova m6, [fourtap_filter_b+mxq]
|
||||
%else
|
||||
shl mxd, 3
|
||||
mova m3, [filter4_h4_shuf]
|
||||
%if PIC
|
||||
lea picregq, [fourtap_filter4_b_m]
|
||||
%endif
|
||||
mova m5, [fourtap_filter4_b+mxq-8]
|
||||
%endif
|
||||
mova m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
|
||||
mova m6, [fourtap_filter_hb+mxq]
|
||||
|
||||
.nextrow:
|
||||
%if %1 == 4
|
||||
movq m0, [srcq-1]
|
||||
pshufb m0, m3
|
||||
pmaddubsw m0, m5
|
||||
movhlps m1, m0
|
||||
%else
|
||||
movu m0, [srcq-1]
|
||||
mova m1, m0
|
||||
pshufb m0, m3
|
||||
pshufb m1, m4
|
||||
pmaddubsw m0, m5
|
||||
pmaddubsw m1, m6
|
||||
%endif
|
||||
add srcq, srcstrideq
|
||||
paddsw m0, m1
|
||||
pmulhrsw m0, m2
|
||||
packuswb m0, m0
|
||||
movh [dstq], m0 ; store
|
||||
MOV [dstq], m0 ; store
|
||||
|
||||
; go to next line
|
||||
add dstq, dststrideq
|
||||
add srcq, srcstrideq
|
||||
dec heightd ; next row
|
||||
jg .nextrow
|
||||
RET
|
||||
@@ -235,71 +277,124 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, h
|
||||
cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
|
||||
shl myd, 4
|
||||
%if PIC
|
||||
lea picregq, [fourtap_filter_hb_m]
|
||||
lea picregq, [fourtap_filter_b_m]
|
||||
%endif
|
||||
mova m5, [fourtap_filter_hb+myq-16]
|
||||
mova m6, [fourtap_filter_hb+myq]
|
||||
mova m5, [fourtap_filter_b+myq-16]
|
||||
mova m6, [fourtap_filter_b+myq]
|
||||
mova m7, [pw_256]
|
||||
|
||||
; read 3 lines
|
||||
sub srcq, srcstrideq
|
||||
movh m0, [srcq]
|
||||
movh m1, [srcq+ srcstrideq]
|
||||
movh m2, [srcq+2*srcstrideq]
|
||||
add srcq, srcstrideq
|
||||
mov picregq, srcstrideq
|
||||
neg picregq
|
||||
MOV m0, [srcq+picregq]
|
||||
MOV m1, [srcq]
|
||||
MOV m2, [srcq+srcstrideq]
|
||||
lea srcq, [srcq+2*srcstrideq]
|
||||
punpcklbw m0, m2
|
||||
|
||||
%if %1 == 4
|
||||
.next2rows:
|
||||
movd m3, [srcq]
|
||||
movd m4, [srcq+srcstrideq]
|
||||
punpcklbw m1, m3
|
||||
punpcklqdq m0, m1
|
||||
punpcklbw m2, m4
|
||||
pmaddubsw m0, m5
|
||||
punpcklqdq m1, m2
|
||||
pmaddubsw m1, m6
|
||||
lea srcq, [srcq+2*srcstrideq]
|
||||
paddsw m1, m0
|
||||
pmulhrsw m1, m7
|
||||
mova m0, m2
|
||||
packuswb m1, m1
|
||||
movd [dstq], m1
|
||||
mova m2, m4
|
||||
psrldq m1, 4
|
||||
movd [dstq+dststrideq], m1
|
||||
mova m1, m3
|
||||
lea dstq, [dstq+2*dststrideq]
|
||||
sub heightd, 2
|
||||
jg .next2rows
|
||||
%else
|
||||
.nextrow:
|
||||
movh m3, [srcq+2*srcstrideq] ; read new row
|
||||
mova m4, m0
|
||||
movh m3, [srcq] ; read new row
|
||||
pmaddubsw m0, m5
|
||||
punpcklbw m1, m3
|
||||
pmaddubsw m4, m1, m6
|
||||
add srcq, srcstrideq
|
||||
paddsw m4, m0
|
||||
mova m0, m1
|
||||
punpcklbw m4, m1
|
||||
mova m1, m2
|
||||
punpcklbw m2, m3
|
||||
pmaddubsw m4, m5
|
||||
pmaddubsw m2, m6
|
||||
paddsw m4, m2
|
||||
mova m2, m3
|
||||
pmulhrsw m4, m7
|
||||
mova m1, m2
|
||||
packuswb m4, m4
|
||||
mova m2, m3
|
||||
movh [dstq], m4
|
||||
|
||||
; go to next line
|
||||
add dstq, dststrideq
|
||||
add srcq, srcstrideq
|
||||
dec heightd ; next row
|
||||
jg .nextrow
|
||||
%endif
|
||||
RET
|
||||
|
||||
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
|
||||
lea myd, [myq*3]
|
||||
%if PIC
|
||||
lea picregq, [sixtap_filter_hb_m]
|
||||
lea picregq, [sixtap_filter_b_m]
|
||||
%endif
|
||||
lea myq, [sixtap_filter_hb+myq*8]
|
||||
lea myq, [sixtap_filter_b+myq*8]
|
||||
|
||||
; read 5 lines
|
||||
sub srcq, srcstrideq
|
||||
sub srcq, srcstrideq
|
||||
movh m0, [srcq]
|
||||
movh m1, [srcq+srcstrideq]
|
||||
movh m2, [srcq+srcstrideq*2]
|
||||
mov picregq, srcstrideq
|
||||
neg picregq
|
||||
MOV m0, [srcq+2*picregq]
|
||||
MOV m1, [srcq+picregq]
|
||||
MOV m2, [srcq]
|
||||
MOV m3, [srcq+srcstrideq]
|
||||
MOV m4, [srcq+2*srcstrideq]
|
||||
lea srcq, [srcq+srcstrideq*2]
|
||||
add srcq, srcstrideq
|
||||
movh m3, [srcq]
|
||||
movh m4, [srcq+srcstrideq]
|
||||
punpcklbw m0, m3
|
||||
punpcklbw m1, m4
|
||||
%if %1 == 4
|
||||
punpcklqdq m0, m1
|
||||
|
||||
.next2rows:
|
||||
movd m5, [srcq+srcstrideq]
|
||||
movd m6, [srcq+2*srcstrideq]
|
||||
pmaddubsw m0, [myq-48]
|
||||
punpcklbw m2, m5
|
||||
punpcklqdq m1, m2
|
||||
pmaddubsw m1, [myq-32]
|
||||
punpcklbw m3, m6
|
||||
punpcklqdq m2, m3
|
||||
paddw m0, m1
|
||||
pmaddubsw m1, m2, [myq-16]
|
||||
lea srcq, [srcq+2*srcstrideq]
|
||||
paddsw m1, m0
|
||||
mova m0, m2
|
||||
pmulhrsw m1, [pw_256]
|
||||
mova m2, m4
|
||||
packuswb m1, m1
|
||||
movd [dstq], m1
|
||||
mova m4, m6
|
||||
psrldq m1, 4
|
||||
movd [dstq+dststrideq], m1
|
||||
lea dstq, [dstq+2*dststrideq]
|
||||
mova m1, m3
|
||||
mova m3, m5
|
||||
sub heightd, 2
|
||||
jg .next2rows
|
||||
%else
|
||||
|
||||
.nextrow:
|
||||
movh m5, [srcq+2*srcstrideq] ; read new row
|
||||
mova m6, m0
|
||||
punpcklbw m6, m5
|
||||
movh m5, [srcq+srcstrideq] ; read new row
|
||||
pmaddubsw m0, [myq-48]
|
||||
punpcklbw m2, m5
|
||||
pmaddubsw m6, m1, [myq-32]
|
||||
pmaddubsw m7, m2, [myq-16]
|
||||
add srcq, srcstrideq
|
||||
paddw m6, m0
|
||||
mova m0, m1
|
||||
punpcklbw m1, m2
|
||||
mova m7, m3
|
||||
punpcklbw m7, m4
|
||||
pmaddubsw m6, [myq-48]
|
||||
pmaddubsw m1, [myq-32]
|
||||
pmaddubsw m7, [myq-16]
|
||||
paddsw m6, m1
|
||||
paddsw m6, m7
|
||||
mova m1, m2
|
||||
mova m2, m3
|
||||
@@ -311,123 +406,16 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr
|
||||
|
||||
; go to next line
|
||||
add dstq, dststrideq
|
||||
add srcq, srcstrideq
|
||||
dec heightd ; next row
|
||||
jg .nextrow
|
||||
%endif
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
INIT_MMX ssse3
|
||||
FILTER_SSSE3 4
|
||||
INIT_XMM ssse3
|
||||
FILTER_SSSE3 4
|
||||
FILTER_SSSE3 8
|
||||
|
||||
; 4x4 block, H-only 4-tap filter
|
||||
INIT_MMX mmxext
|
||||
cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
|
||||
shl mxd, 4
|
||||
%if PIC
|
||||
lea picregq, [fourtap_filter_hw_m]
|
||||
%endif
|
||||
movq mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
|
||||
movq mm5, [fourtap_filter_hw+mxq]
|
||||
movq mm7, [pw_64]
|
||||
pxor mm6, mm6
|
||||
|
||||
.nextrow:
|
||||
movq mm1, [srcq-1] ; (ABCDEFGH) load 8 horizontal pixels
|
||||
|
||||
; first set of 2 pixels
|
||||
movq mm2, mm1 ; byte ABCD..
|
||||
punpcklbw mm1, mm6 ; byte->word ABCD
|
||||
pshufw mm0, mm2, 9 ; byte CDEF..
|
||||
punpcklbw mm0, mm6 ; byte->word CDEF
|
||||
pshufw mm3, mm1, 0x94 ; word ABBC
|
||||
pshufw mm1, mm0, 0x94 ; word CDDE
|
||||
pmaddwd mm3, mm4 ; multiply 2px with F0/F1
|
||||
movq mm0, mm1 ; backup for second set of pixels
|
||||
pmaddwd mm1, mm5 ; multiply 2px with F2/F3
|
||||
paddd mm3, mm1 ; finish 1st 2px
|
||||
|
||||
; second set of 2 pixels, use backup of above
|
||||
punpckhbw mm2, mm6 ; byte->word EFGH
|
||||
pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1
|
||||
pshufw mm1, mm2, 0x94 ; word EFFG
|
||||
pmaddwd mm1, mm5 ; multiply 2px with F2/F3
|
||||
paddd mm0, mm1 ; finish 2nd 2px
|
||||
|
||||
; merge two sets of 2 pixels into one set of 4, round/clip/store
|
||||
packssdw mm3, mm0 ; merge dword->word (4px)
|
||||
paddsw mm3, mm7 ; rounding
|
||||
psraw mm3, 7
|
||||
packuswb mm3, mm6 ; clip and word->bytes
|
||||
movd [dstq], mm3 ; store
|
||||
|
||||
; go to next line
|
||||
add dstq, dststrideq
|
||||
add srcq, srcstrideq
|
||||
dec heightd ; next row
|
||||
jg .nextrow
|
||||
RET
|
||||
|
||||
; 4x4 block, H-only 6-tap filter
|
||||
INIT_MMX mmxext
|
||||
cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
|
||||
lea mxd, [mxq*3]
|
||||
%if PIC
|
||||
lea picregq, [sixtap_filter_hw_m]
|
||||
%endif
|
||||
movq mm4, [sixtap_filter_hw+mxq*8-48] ; set up 4tap filter in words
|
||||
movq mm5, [sixtap_filter_hw+mxq*8-32]
|
||||
movq mm6, [sixtap_filter_hw+mxq*8-16]
|
||||
movq mm7, [pw_64]
|
||||
pxor mm3, mm3
|
||||
|
||||
.nextrow:
|
||||
movq mm1, [srcq-2] ; (ABCDEFGH) load 8 horizontal pixels
|
||||
|
||||
; first set of 2 pixels
|
||||
movq mm2, mm1 ; byte ABCD..
|
||||
punpcklbw mm1, mm3 ; byte->word ABCD
|
||||
pshufw mm0, mm2, 0x9 ; byte CDEF..
|
||||
punpckhbw mm2, mm3 ; byte->word EFGH
|
||||
punpcklbw mm0, mm3 ; byte->word CDEF
|
||||
pshufw mm1, mm1, 0x94 ; word ABBC
|
||||
pshufw mm2, mm2, 0x94 ; word EFFG
|
||||
pmaddwd mm1, mm4 ; multiply 2px with F0/F1
|
||||
pshufw mm3, mm0, 0x94 ; word CDDE
|
||||
movq mm0, mm3 ; backup for second set of pixels
|
||||
pmaddwd mm3, mm5 ; multiply 2px with F2/F3
|
||||
paddd mm1, mm3 ; add to 1st 2px cache
|
||||
movq mm3, mm2 ; backup for second set of pixels
|
||||
pmaddwd mm2, mm6 ; multiply 2px with F4/F5
|
||||
paddd mm1, mm2 ; finish 1st 2px
|
||||
|
||||
; second set of 2 pixels, use backup of above
|
||||
movd mm2, [srcq+3] ; byte FGHI (prevent overreads)
|
||||
pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1
|
||||
pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3
|
||||
paddd mm0, mm3 ; add to 2nd 2px cache
|
||||
pxor mm3, mm3
|
||||
punpcklbw mm2, mm3 ; byte->word FGHI
|
||||
pshufw mm2, mm2, 0xE9 ; word GHHI
|
||||
pmaddwd mm2, mm6 ; multiply 2px with F4/F5
|
||||
paddd mm0, mm2 ; finish 2nd 2px
|
||||
|
||||
; merge two sets of 2 pixels into one set of 4, round/clip/store
|
||||
packssdw mm1, mm0 ; merge dword->word (4px)
|
||||
paddsw mm1, mm7 ; rounding
|
||||
psraw mm1, 7
|
||||
packuswb mm1, mm3 ; clip and word->bytes
|
||||
movd [dstq], mm1 ; store
|
||||
|
||||
; go to next line
|
||||
add dstq, dststrideq
|
||||
add srcq, srcstrideq
|
||||
dec heightd ; next row
|
||||
jg .nextrow
|
||||
RET
|
||||
|
||||
INIT_XMM sse2
|
||||
cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
|
||||
shl mxd, 5
|
||||
@@ -461,17 +449,17 @@ cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, h
|
||||
pmullw m2, [mxq+32]
|
||||
pmullw m3, [mxq+48]
|
||||
%endif
|
||||
paddsw m0, m1
|
||||
paddsw m2, m3
|
||||
add srcq, srcstrideq
|
||||
paddw m0, m1
|
||||
paddw m2, m3
|
||||
paddw m0, m4
|
||||
paddsw m0, m2
|
||||
paddsw m0, m4
|
||||
psraw m0, 7
|
||||
packuswb m0, m7
|
||||
movh [dstq], m0 ; store
|
||||
|
||||
; go to next line
|
||||
add dstq, dststrideq
|
||||
add srcq, srcstrideq
|
||||
dec heightd ; next row
|
||||
jg .nextrow
|
||||
RET
|
||||
@@ -522,26 +510,26 @@ cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, h
|
||||
pmullw m4, [mxq+64]
|
||||
pmullw m5, [mxq+80]
|
||||
%endif
|
||||
paddsw m1, m4
|
||||
paddsw m0, m5
|
||||
paddsw m1, m2
|
||||
paddsw m0, m3
|
||||
add srcq, srcstrideq
|
||||
paddw m1, m4
|
||||
paddw m0, m5
|
||||
paddw m1, m2
|
||||
paddw m0, m3
|
||||
paddw m1, m6
|
||||
paddsw m0, m1
|
||||
paddsw m0, m6
|
||||
psraw m0, 7
|
||||
packuswb m0, m7
|
||||
movh [dstq], m0 ; store
|
||||
|
||||
; go to next line
|
||||
add dstq, dststrideq
|
||||
add srcq, srcstrideq
|
||||
dec heightd ; next row
|
||||
jg .nextrow
|
||||
RET
|
||||
|
||||
%macro FILTER_V 1
|
||||
INIT_XMM sse2
|
||||
; 4x4 block, V-only 4-tap filter
|
||||
cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
|
||||
cglobal put_vp8_epel8_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
|
||||
shl myd, 5
|
||||
%if PIC
|
||||
lea picregq, [fourtap_filter_v_m]
|
||||
@@ -568,33 +556,33 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr
|
||||
mova m3, m4
|
||||
pmullw m0, [myq+0]
|
||||
pmullw m4, m5
|
||||
paddsw m4, m0
|
||||
paddw m4, m0
|
||||
|
||||
; then calculate positive taps
|
||||
mova m0, m1
|
||||
pmullw m1, [myq+16]
|
||||
paddsw m4, m1
|
||||
paddw m4, m1
|
||||
mova m1, m2
|
||||
pmullw m2, [myq+32]
|
||||
paddw m4, m6
|
||||
add srcq, srcstrideq
|
||||
paddsw m4, m2
|
||||
mova m2, m3
|
||||
|
||||
; round/clip/store
|
||||
paddsw m4, m6
|
||||
psraw m4, 7
|
||||
packuswb m4, m7
|
||||
movh [dstq], m4
|
||||
|
||||
; go to next line
|
||||
add dstq, dststrideq
|
||||
add srcq, srcstrideq
|
||||
dec heightd ; next row
|
||||
jg .nextrow
|
||||
RET
|
||||
|
||||
|
||||
; 4x4 block, V-only 6-tap filter
|
||||
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
|
||||
cglobal put_vp8_epel8_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
|
||||
shl myd, 4
|
||||
lea myq, [myq*3]
|
||||
%if PIC
|
||||
@@ -604,15 +592,14 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr
|
||||
pxor m7, m7
|
||||
|
||||
; read 5 lines
|
||||
sub srcq, srcstrideq
|
||||
sub srcq, srcstrideq
|
||||
movh m0, [srcq]
|
||||
movh m1, [srcq+srcstrideq]
|
||||
movh m2, [srcq+srcstrideq*2]
|
||||
mov picregq, srcstrideq
|
||||
neg picregq
|
||||
movh m0, [srcq+2*picregq]
|
||||
movh m1, [srcq+picregq]
|
||||
movh m2, [srcq]
|
||||
movh m3, [srcq+srcstrideq]
|
||||
movh m4, [srcq+2*srcstrideq]
|
||||
lea srcq, [srcq+srcstrideq*2]
|
||||
add srcq, srcstrideq
|
||||
movh m3, [srcq]
|
||||
movh m4, [srcq+srcstrideq]
|
||||
punpcklbw m0, m7
|
||||
punpcklbw m1, m7
|
||||
punpcklbw m2, m7
|
||||
@@ -625,19 +612,21 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr
|
||||
pmullw m5, [myq+16]
|
||||
mova m6, m4
|
||||
pmullw m6, [myq+64]
|
||||
paddsw m6, m5
|
||||
paddw m6, m5
|
||||
|
||||
; then calculate positive taps
|
||||
movh m5, [srcq+2*srcstrideq] ; read new row
|
||||
movh m5, [srcq+srcstrideq] ; read new row
|
||||
punpcklbw m5, m7
|
||||
pmullw m0, [myq+0]
|
||||
paddsw m6, m0
|
||||
paddw m6, [pw_64]
|
||||
paddw m6, m0
|
||||
mova m0, m1
|
||||
mova m1, m2
|
||||
pmullw m2, [myq+32]
|
||||
paddsw m6, m2
|
||||
paddw m6, m2
|
||||
mova m2, m3
|
||||
pmullw m3, [myq+48]
|
||||
add srcq, srcstrideq
|
||||
paddsw m6, m3
|
||||
mova m3, m4
|
||||
mova m4, m5
|
||||
@@ -645,23 +634,15 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr
|
||||
paddsw m6, m5
|
||||
|
||||
; round/clip/store
|
||||
paddsw m6, [pw_64]
|
||||
psraw m6, 7
|
||||
packuswb m6, m7
|
||||
movh [dstq], m6
|
||||
|
||||
; go to next line
|
||||
add dstq, dststrideq
|
||||
add srcq, srcstrideq
|
||||
dec heightd ; next row
|
||||
jg .nextrow
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmxext
|
||||
FILTER_V 4
|
||||
INIT_XMM sse2
|
||||
FILTER_V 8
|
||||
|
||||
%macro FILTER_BILINEAR 1
|
||||
%if cpuflag(ssse3)
|
||||
@@ -672,14 +653,15 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, p
|
||||
%endif
|
||||
pxor m4, m4
|
||||
mova m3, [bilinear_filter_vb+myq-16]
|
||||
.nextrow:
|
||||
movh m0, [srcq+srcstrideq*0]
|
||||
.nextrow:
|
||||
movh m1, [srcq+srcstrideq*1]
|
||||
movh m2, [srcq+srcstrideq*2]
|
||||
punpcklbw m0, m1
|
||||
punpcklbw m1, m2
|
||||
pmaddubsw m0, m3
|
||||
pmaddubsw m1, m3
|
||||
lea srcq, [srcq+srcstrideq*2]
|
||||
psraw m0, 2
|
||||
psraw m1, 2
|
||||
pavgw m0, m4
|
||||
@@ -694,6 +676,7 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, p
|
||||
movh [dstq+dststrideq*0], m0
|
||||
movhps [dstq+dststrideq*1], m0
|
||||
%endif
|
||||
mova m0, m2
|
||||
%else ; cpuflag(ssse3)
|
||||
cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
|
||||
shl myd, 4
|
||||
@@ -716,26 +699,19 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, p
|
||||
pmullw m1, m5
|
||||
pmullw m2, m4
|
||||
pmullw m3, m5
|
||||
paddsw m0, m1
|
||||
paddsw m2, m3
|
||||
lea srcq, [srcq+srcstrideq*2]
|
||||
paddw m0, m1
|
||||
paddw m2, m3
|
||||
psraw m0, 2
|
||||
psraw m2, 2
|
||||
pavgw m0, m6
|
||||
pavgw m2, m6
|
||||
%if mmsize == 8
|
||||
packuswb m0, m0
|
||||
packuswb m2, m2
|
||||
movh [dstq+dststrideq*0], m0
|
||||
movh [dstq+dststrideq*1], m2
|
||||
%else
|
||||
packuswb m0, m2
|
||||
movh [dstq+dststrideq*0], m0
|
||||
movhps [dstq+dststrideq*1], m0
|
||||
%endif
|
||||
%endif ; cpuflag(ssse3)
|
||||
|
||||
lea dstq, [dstq+dststrideq*2]
|
||||
lea srcq, [srcq+srcstrideq*2]
|
||||
sub heightd, 2
|
||||
jg .nextrow
|
||||
RET
|
||||
@@ -756,6 +732,7 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride
|
||||
pshufb m1, m2
|
||||
pmaddubsw m0, m3
|
||||
pmaddubsw m1, m3
|
||||
lea srcq, [srcq+srcstrideq*2]
|
||||
psraw m0, 2
|
||||
psraw m1, 2
|
||||
pavgw m0, m4
|
||||
@@ -793,33 +770,24 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride
|
||||
pmullw m1, m5
|
||||
pmullw m2, m4
|
||||
pmullw m3, m5
|
||||
paddsw m0, m1
|
||||
paddsw m2, m3
|
||||
lea srcq, [srcq+srcstrideq*2]
|
||||
paddw m0, m1
|
||||
paddw m2, m3
|
||||
psraw m0, 2
|
||||
psraw m2, 2
|
||||
pavgw m0, m6
|
||||
pavgw m2, m6
|
||||
%if mmsize == 8
|
||||
packuswb m0, m0
|
||||
packuswb m2, m2
|
||||
movh [dstq+dststrideq*0], m0
|
||||
movh [dstq+dststrideq*1], m2
|
||||
%else
|
||||
packuswb m0, m2
|
||||
movh [dstq+dststrideq*0], m0
|
||||
movhps [dstq+dststrideq*1], m0
|
||||
%endif
|
||||
%endif ; cpuflag(ssse3)
|
||||
|
||||
lea dstq, [dstq+dststrideq*2]
|
||||
lea srcq, [srcq+srcstrideq*2]
|
||||
sub heightd, 2
|
||||
jg .nextrow
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmxext
|
||||
FILTER_BILINEAR 4
|
||||
INIT_XMM sse2
|
||||
FILTER_BILINEAR 8
|
||||
INIT_MMX ssse3
|
||||
@@ -827,14 +795,22 @@ FILTER_BILINEAR 4
|
||||
INIT_XMM ssse3
|
||||
FILTER_BILINEAR 8
|
||||
|
||||
INIT_MMX mmx
|
||||
cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
|
||||
INIT_XMM sse2
|
||||
cglobal put_vp8_pixels8, 5, 5+2*ARCH_X86_64, 2, dst, dststride, src, srcstride, height
|
||||
.nextrow:
|
||||
movq mm0, [srcq+srcstrideq*0]
|
||||
movq mm1, [srcq+srcstrideq*1]
|
||||
%if ARCH_X86_64
|
||||
mov r5q, [srcq+srcstrideq*0]
|
||||
mov r6q, [srcq+srcstrideq*1]
|
||||
lea srcq, [srcq+srcstrideq*2]
|
||||
movq [dstq+dststrideq*0], mm0
|
||||
movq [dstq+dststrideq*1], mm1
|
||||
mov [dstq+dststrideq*0], r5q
|
||||
mov [dstq+dststrideq*1], r6q
|
||||
%else
|
||||
movq m0, [srcq+srcstrideq*0]
|
||||
movq m1, [srcq+srcstrideq*1]
|
||||
lea srcq, [srcq+srcstrideq*2]
|
||||
movq [dstq+dststrideq*0], m0
|
||||
movq [dstq+dststrideq*1], m1
|
||||
%endif
|
||||
lea dstq, [dstq+dststrideq*2]
|
||||
sub heightd, 2
|
||||
jg .nextrow
|
||||
|
||||
@@ -29,19 +29,6 @@
|
||||
/*
|
||||
* MC functions
|
||||
*/
|
||||
void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride,
|
||||
const uint8_t *src, ptrdiff_t srcstride,
|
||||
int height, int mx, int my);
|
||||
void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride,
|
||||
const uint8_t *src, ptrdiff_t srcstride,
|
||||
int height, int mx, int my);
|
||||
void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride,
|
||||
const uint8_t *src, ptrdiff_t srcstride,
|
||||
int height, int mx, int my);
|
||||
void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride,
|
||||
const uint8_t *src, ptrdiff_t srcstride,
|
||||
int height, int mx, int my);
|
||||
|
||||
void ff_put_vp8_epel8_h4_sse2 (uint8_t *dst, ptrdiff_t dststride,
|
||||
const uint8_t *src, ptrdiff_t srcstride,
|
||||
int height, int mx, int my);
|
||||
@@ -80,9 +67,6 @@ void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
|
||||
const uint8_t *src, ptrdiff_t srcstride,
|
||||
int height, int mx, int my);
|
||||
|
||||
void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride,
|
||||
const uint8_t *src, ptrdiff_t srcstride,
|
||||
int height, int mx, int my);
|
||||
void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, ptrdiff_t dststride,
|
||||
const uint8_t *src, ptrdiff_t srcstride,
|
||||
int height, int mx, int my);
|
||||
@@ -93,9 +77,6 @@ void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
|
||||
const uint8_t *src, ptrdiff_t srcstride,
|
||||
int height, int mx, int my);
|
||||
|
||||
void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride,
|
||||
const uint8_t *src, ptrdiff_t srcstride,
|
||||
int height, int mx, int my);
|
||||
void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, ptrdiff_t dststride,
|
||||
const uint8_t *src, ptrdiff_t srcstride,
|
||||
int height, int mx, int my);
|
||||
@@ -107,7 +88,7 @@ void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
|
||||
int height, int mx, int my);
|
||||
|
||||
|
||||
void ff_put_vp8_pixels8_mmx (uint8_t *dst, ptrdiff_t dststride,
|
||||
void ff_put_vp8_pixels8_sse2(uint8_t *dst, ptrdiff_t dststride,
|
||||
const uint8_t *src, ptrdiff_t srcstride,
|
||||
int height, int mx, int my);
|
||||
void ff_put_vp8_pixels16_sse(uint8_t *dst, ptrdiff_t dststride,
|
||||
@@ -124,16 +105,6 @@ static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \
|
||||
ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
|
||||
dst + 8, dststride, src + 8, srcstride, height, mx, my); \
|
||||
}
|
||||
#define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \
|
||||
static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
|
||||
uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
|
||||
ptrdiff_t srcstride, int height, int mx, int my) \
|
||||
{ \
|
||||
ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
|
||||
dst, dststride, src, srcstride, height, mx, my); \
|
||||
ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
|
||||
dst + 4, dststride, src + 4, srcstride, height, mx, my); \
|
||||
}
|
||||
|
||||
TAP_W16(sse2, epel, h6)
|
||||
TAP_W16(sse2, epel, v6)
|
||||
@@ -159,14 +130,6 @@ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT
|
||||
dst, dststride, tmpptr, SIZE, height, mx, my); \
|
||||
}
|
||||
|
||||
#define HVTAPMMX(x, y) \
|
||||
HVTAP(mmxext, 8, x, y, 4, 8)
|
||||
|
||||
HVTAPMMX(4, 4)
|
||||
HVTAPMMX(4, 6)
|
||||
HVTAPMMX(6, 4)
|
||||
HVTAPMMX(6, 6)
|
||||
|
||||
#define HVTAPSSE2(x, y, w) \
|
||||
HVTAP(sse2, 16, x, y, w, 16) \
|
||||
HVTAP(ssse3, 16, x, y, w, 16)
|
||||
@@ -194,7 +157,6 @@ static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
|
||||
dst, dststride, tmp, SIZE, height, mx, my); \
|
||||
}
|
||||
|
||||
HVBILIN(mmxext, 8, 4, 8)
|
||||
HVBILIN(sse2, 8, 8, 16)
|
||||
HVBILIN(sse2, 8, 16, 16)
|
||||
HVBILIN(ssse3, 8, 4, 8)
|
||||
@@ -280,30 +242,22 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (EXTERNAL_MMX(cpu_flags)) {
|
||||
c->put_vp8_epel_pixels_tab[1][0][0] =
|
||||
c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
|
||||
}
|
||||
|
||||
/* note that 4-tap width=16 functions are missing because w=16
|
||||
* is only used for luma, and luma is always a copy or sixtap. */
|
||||
if (EXTERNAL_MMXEXT(cpu_flags)) {
|
||||
VP8_MC_FUNC(2, 4, mmxext);
|
||||
VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
|
||||
}
|
||||
|
||||
if (EXTERNAL_SSE(cpu_flags)) {
|
||||
c->put_vp8_epel_pixels_tab[0][0][0] =
|
||||
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
|
||||
}
|
||||
|
||||
if (EXTERNAL_SSE2_SLOW(cpu_flags)) {
|
||||
c->put_vp8_epel_pixels_tab[1][0][0] =
|
||||
c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_sse2;
|
||||
VP8_LUMA_MC_FUNC(0, 16, sse2);
|
||||
VP8_MC_FUNC(1, 8, sse2);
|
||||
VP8_BILINEAR_MC_FUNC(0, 16, sse2);
|
||||
VP8_BILINEAR_MC_FUNC(1, 8, sse2);
|
||||
}
|
||||
|
||||
/* note that 4-tap width=16 functions are missing because w=16
|
||||
* is only used for luma, and luma is always a copy or sixtap. */
|
||||
if (EXTERNAL_SSSE3(cpu_flags)) {
|
||||
VP8_LUMA_MC_FUNC(0, 16, ssse3);
|
||||
VP8_MC_FUNC(1, 8, ssse3);
|
||||
|
||||
@@ -27,10 +27,8 @@
|
||||
#include "libavutil/internal.h"
|
||||
#include "libavutil/frame.h"
|
||||
#include "libavutil/opt.h"
|
||||
#include "audio.h"
|
||||
#include "avfilter.h"
|
||||
#include "filters.h"
|
||||
#include "video.h"
|
||||
|
||||
enum SideDataMode {
|
||||
SIDEDATA_SELECT,
|
||||
@@ -96,6 +94,31 @@ static av_cold int init(AVFilterContext *ctx)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int config_props(AVFilterLink *outlink)
|
||||
{
|
||||
AVFilterContext *ctx = outlink->src;
|
||||
SideDataContext *s = ctx->priv;
|
||||
const AVFrameSideData *sd = NULL;
|
||||
|
||||
if (s->type != -1)
|
||||
sd = av_frame_side_data_get(outlink->side_data, outlink->nb_side_data, s->type);
|
||||
|
||||
switch (s->mode) {
|
||||
case SIDEDATA_SELECT:
|
||||
break;
|
||||
case SIDEDATA_DELETE:
|
||||
if (s->type == -1)
|
||||
av_frame_side_data_free(&outlink->side_data, &outlink->nb_side_data);
|
||||
else if (sd)
|
||||
av_frame_side_data_remove(&outlink->side_data, &outlink->nb_side_data, s->type);
|
||||
break;
|
||||
default:
|
||||
av_assert0(0);
|
||||
};
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
|
||||
{
|
||||
AVFilterContext *ctx = inlink->dst;
|
||||
@@ -143,6 +166,14 @@ static const AVFilterPad ainputs[] = {
|
||||
},
|
||||
};
|
||||
|
||||
static const AVFilterPad aoutputs[] = {
|
||||
{
|
||||
.name = "default",
|
||||
.type = AVMEDIA_TYPE_AUDIO,
|
||||
.config_props = config_props,
|
||||
},
|
||||
};
|
||||
|
||||
const FFFilter ff_af_asidedata = {
|
||||
.p.name = "asidedata",
|
||||
.p.description = NULL_IF_CONFIG_SMALL("Manipulate audio frame side data."),
|
||||
@@ -152,7 +183,7 @@ const FFFilter ff_af_asidedata = {
|
||||
.priv_size = sizeof(SideDataContext),
|
||||
.init = init,
|
||||
FILTER_INPUTS(ainputs),
|
||||
FILTER_OUTPUTS(ff_audio_default_filterpad),
|
||||
FILTER_OUTPUTS(aoutputs),
|
||||
};
|
||||
#endif /* CONFIG_ASIDEDATA_FILTER */
|
||||
|
||||
@@ -169,6 +200,14 @@ static const AVFilterPad inputs[] = {
|
||||
},
|
||||
};
|
||||
|
||||
static const AVFilterPad outputs[] = {
|
||||
{
|
||||
.name = "default",
|
||||
.type = AVMEDIA_TYPE_VIDEO,
|
||||
.config_props = config_props,
|
||||
},
|
||||
};
|
||||
|
||||
const FFFilter ff_vf_sidedata = {
|
||||
.p.name = "sidedata",
|
||||
.p.description = NULL_IF_CONFIG_SMALL("Manipulate video frame side data."),
|
||||
@@ -178,6 +217,6 @@ const FFFilter ff_vf_sidedata = {
|
||||
.priv_size = sizeof(SideDataContext),
|
||||
.init = init,
|
||||
FILTER_INPUTS(inputs),
|
||||
FILTER_OUTPUTS(ff_video_default_filterpad),
|
||||
FILTER_OUTPUTS(outputs),
|
||||
};
|
||||
#endif /* CONFIG_SIDEDATA_FILTER */
|
||||
|
||||
@@ -7867,8 +7867,11 @@ static int mov_init_iamf_track(AVFormatContext *s)
|
||||
default:
|
||||
av_assert0(0);
|
||||
}
|
||||
if (ret < 0)
|
||||
if (ret < 0) {
|
||||
ff_iamf_uninit_context(iamf);
|
||||
av_free(iamf);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
track = &mov->tracks[first_iamf_idx];
|
||||
|
||||
@@ -510,7 +510,8 @@ static void checkasm_check_vp78dsp(VP8DSPContext *d, bool is_vp7)
|
||||
|
||||
void checkasm_check_vp8dsp(void)
|
||||
{
|
||||
VP8DSPContext d;
|
||||
// Needs to be zeroed because not all size 16 epel functions exist.
|
||||
VP8DSPContext d = { 0 };
|
||||
|
||||
ff_vp78dsp_init(&d);
|
||||
check_mc(&d);
|
||||
|
||||
Reference in New Issue
Block a user