From dcb28ed860166c9715afb7c71c70889e6b9b8c8d Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt Date: Sun, 12 Oct 2025 17:19:26 +0200 Subject: [PATCH] avfilter/x86/vf_spp: Port store_slice to SSE2 This allows to remove an emms_c from the filter. It also gives 25% speedup here (when timing the calls to store_slice using START/STOP_TIMER). Signed-off-by: Andreas Rheinhardt --- libavfilter/vf_spp.c | 2 -- libavfilter/x86/vf_spp.c | 49 ++++++++++++++++++++-------------------- 2 files changed, 24 insertions(+), 27 deletions(-) diff --git a/libavfilter/vf_spp.c b/libavfilter/vf_spp.c index 5c5b98f8db..20c9fd4340 100644 --- a/libavfilter/vf_spp.c +++ b/libavfilter/vf_spp.c @@ -31,7 +31,6 @@ * ported by Clément Bœsch for FFmpeg. */ -#include "libavutil/emms.h" #include "libavutil/imgutils.h" #include "libavutil/mem.h" #include "libavutil/mem_internal.h" @@ -425,7 +424,6 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in) filter(s, out->data[1], in->data[1], out->linesize[1], in->linesize[1], cw, ch, qp_table, qp_stride, 0, depth); filter(s, out->data[2], in->data[2], out->linesize[2], in->linesize[2], cw, ch, qp_table, qp_stride, 0, depth); } - emms_c(); } } diff --git a/libavfilter/x86/vf_spp.c b/libavfilter/x86/vf_spp.c index f8e5727bfc..48c3d25d7c 100644 --- a/libavfilter/x86/vf_spp.c +++ b/libavfilter/x86/vf_spp.c @@ -18,16 +18,20 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ +#include +#include "config.h" #include "libavutil/attributes.h" #include "libavutil/cpu.h" +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" #include "libavfilter/vf_spp.h" -#if HAVE_MMX_INLINE -static void store_slice_mmx(uint8_t *dst, const int16_t *src, - int dst_stride, int src_stride, - int width, int height, int log2_scale, - const uint8_t dither[8][8]) +#if HAVE_SSE2_INLINE +static void store_slice_sse2(uint8_t *dst, const int16_t *src, + int dst_stride, int src_stride, + int width, int height, int log2_scale, + const uint8_t dither[8][8]) { int y; @@ -35,30 +39,25 @@ static void store_slice_mmx(uint8_t *dst, const int16_t *src, uint8_t *dst1 = dst; const int16_t *src1 = src; __asm__ volatile( - "movq (%3), %%mm3 \n" - "movq (%3), %%mm4 \n" - "movd %4, %%mm2 \n" - "pxor %%mm0, %%mm0 \n" - "punpcklbw %%mm0, %%mm3 \n" - "punpckhbw %%mm0, %%mm4 \n" - "psraw %%mm2, %%mm3 \n" - "psraw %%mm2, %%mm4 \n" - "movd %5, %%mm2 \n" + "movq (%3), %%xmm1 \n" + "movd %4, %%xmm2 \n" + "pxor %%xmm0, %%xmm0 \n" + "punpcklbw %%xmm0, %%xmm1 \n" + "psraw %%xmm2, %%xmm1 \n" + "movd %5, %%xmm2 \n" "1: \n" - "movq (%0), %%mm0 \n" - "movq 8(%0), %%mm1 \n" - "paddw %%mm3, %%mm0 \n" - "paddw %%mm4, %%mm1 \n" - "psraw %%mm2, %%mm0 \n" - "psraw %%mm2, %%mm1 \n" - "packuswb %%mm1, %%mm0 \n" - "movq %%mm0, (%1) \n" + "movdqa (%0), %%xmm0 \n" + "paddw %%xmm1, %%xmm0 \n" + "psraw %%xmm2, %%xmm0 \n" + "packuswb %%xmm0, %%xmm0 \n" + "movq %%xmm0, (%1) \n" "add $16, %0 \n" "add $8, %1 \n" "cmp %2, %1 \n" " jb 1b \n" : "+r" (src1), "+r"(dst1) : "r"(dst + width), "r"(dither[y]), "g"(log2_scale), "g"(MAX_LEVEL - log2_scale) + XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2") ); src += src_stride; dst += dst_stride; @@ -69,11 +68,11 @@ static void store_slice_mmx(uint8_t *dst, const int16_t *src, av_cold void ff_spp_init_x86(SPPContext *s) { -#if HAVE_MMX_INLINE +#if HAVE_SSE2_INLINE int cpu_flags = av_get_cpu_flags(); - if (cpu_flags & AV_CPU_FLAG_MMX) { - s->store_slice = store_slice_mmx; + if (INLINE_SSE2(cpu_flags)) { + s->store_slice = store_slice_sse2; } #endif }