Compare commits

...

10 Commits

Author SHA1 Message Date
Andreas Rheinhardt
89f984e3d1 avcodec/x86/h264_idct: Fix ff_h264_luma_dc_dequant_idct_sse2 checkasm failures
ff_h264_luma_dc_dequant_idct_sse2() does not pass checkasm for certain
seeds, because the input to packssdw no longer fits into an int16_t,
leading to saturation, whereas the C code just truncates. I don't know
whether the spec contains provisions ensuring that valid input must not
exceed 16 bits, or whether such inputs (even if invalid) can be
triggered by actual code and not only by the test.

This commit adapts the behavior of the function to the C reference code
to fix the test. packssdw is avoided; instead, the lower words are
transferred directly to GPRs to be written out. This has unfortunately
led to a slight performance regression here (14.5 vs. 15.1 cycles).

Fixes issue #20835.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2025-11-30 00:15:43 +01:00
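
As a point of reference, a minimal C sketch (not part of the patch;
hypothetical helper names) of why saturating packing and plain
truncation diverge once a value no longer fits into 16 bits:

    #include <stdint.h>
    #include <stdio.h>

    /* Per-lane behaviour of packssdw: clamp into int16_t range. */
    static int16_t pack_saturate(int32_t v)
    {
        if (v >  32767) return  32767;
        if (v < -32768) return -32768;
        return (int16_t)v;
    }

    /* What the C reference code effectively does: keep the low 16 bits. */
    static int16_t pack_truncate(int32_t v)
    {
        return (int16_t)(uint16_t)v;   /* modular wrap-around */
    }

    int main(void)
    {
        int32_t v = 40000;             /* does not fit into int16_t */
        printf("saturate: %d, truncate: %d\n",
               pack_saturate(v), pack_truncate(v));
        /* prints "saturate: 32767, truncate: -25536" */
        return 0;
    }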
Andreas Rheinhardt
e6ae2802a3 avcodec/x86/h264_idct: Deduplicate generating constant
pw_1 is currently loaded in both code paths. Generate it earlier instead.
This gives a tiny speedup (15 vs. 14.5 cycles) and reduces code size.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2025-11-30 00:15:43 +01:00
Andreas Rheinhardt
ada0a81577 avcodec/x86/h264_idct: Don't use MMX registers in ff_h264_luma_dc_dequant_idct_sse2
Avoiding them is ABI-compliant and gives a tiny speedup here (and the
code is 16B smaller).

Old benchmarks:
h264_luma_dc_dequant_idct_8_c:                          33.2 ( 1.00x)
h264_luma_dc_dequant_idct_8_sse2:                       16.0 ( 2.07x)

New benchmarks:
h264_luma_dc_dequant_idct_8_c:                          33.0 ( 1.00x)
h264_luma_dc_dequant_idct_8_sse2:                       15.0 ( 2.20x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2025-11-30 00:15:43 +01:00
Andreas Rheinhardt
012c25bac4 avcodec/x86/h264_idct: Zero with full-width stores
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2025-11-30 00:15:43 +01:00
Andreas Rheinhardt
b9cbbd9074 avcodec/x86/h264_idct: Use tail call where advantageous
This is possible on UNIX64, where no epilogue is needed.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2025-11-30 00:15:43 +01:00
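
As a rough C-level analogy (hypothetical names, not the actual code):
when the final call of a function is followed by nothing that still
needs the caller's frame, "call + ret" can collapse into a single jmp,
which is what the TAIL_CALL_IF_LAST macro in the diff below does at the
assembly level:

    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical worker standing in for h264_add8x4_idct_sse2. */
    static void add_block(uint8_t *dst, const int16_t *block, ptrdiff_t stride)
    {
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 8; x++)
                dst[y * stride + x] = (uint8_t)(dst[y * stride + x] + block[8 * y + x]);
    }

    void idct_add_two(uint8_t *dst, const int16_t *block, ptrdiff_t stride)
    {
        add_block(dst, block, stride);                   /* ordinary call */
        add_block(dst + 4 * stride, block + 32, stride); /* tail position */
        /* Nothing follows and no cleanup is needed, so a compiler may emit */
        /* "jmp add_block" here instead of "call add_block" plus "ret".     */
    }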
Andreas Rheinhardt
0ec9c1b68d avutil/x86/x86inc: Use parentheses in has_epilogue
Prevents surprises when has_epilogue is used inside a larger expression.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2025-11-30 00:15:43 +01:00
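
NASM's %define substitutes textually much like C's #define, and
has_epilogue is now used under a ! operator (see TAIL_CALL_IF_LAST in
the diff below), so here is a C-preprocessor analogy of the surprise
the parentheses prevent:

    #include <stdio.h>

    /* Example usage where no epilogue is actually needed. */
    #define REGS_USED   3
    #define STACK_SIZE  0

    /* Unparenthesized, like has_epilogue before this change. */
    #define HAS_EPILOGUE_BAD    REGS_USED > 7 || STACK_SIZE > 0
    /* Parenthesized, like has_epilogue after this change. */
    #define HAS_EPILOGUE_GOOD  (REGS_USED > 7 || STACK_SIZE > 0)

    int main(void)
    {
        /* Under '!', the unparenthesized form expands to
         *   !3 > 7 || 0 > 0   ==   ((!3) > 7) || (0 > 0)   ==   0
         * instead of the intended 1. */
        printf("!bad  = %d\n", !HAS_EPILOGUE_BAD);   /* 0 -- surprise */
        printf("!good = %d\n", !HAS_EPILOGUE_GOOD);  /* 1 -- intended */
        return 0;
    }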
Andreas Rheinhardt
01ff05e4bc avcodec/x86/h264_idct: Avoid call where possible
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2025-11-30 00:15:43 +01:00
Andreas Rheinhardt
b51cbd4116 avcodec/x86/h264_idct: Remove redundant movsxdifnidn
Only exported (i.e. cglobal) functions need it; stride is already
sign-extended when it reaches any of the internal functions used here,
so don't sign-extend again.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2025-11-30 00:15:43 +01:00
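
A hedged C analogy (hypothetical names): the int stride is widened to
pointer width exactly once, at the exported boundary, and every
internal helper then receives it already sign-extended:

    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical internal helper: stride arrives already widened and is
     * used directly in pointer arithmetic -- no second sign extension. */
    static void add_rows(uint8_t *dst, const int16_t *block, ptrdiff_t stride)
    {
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                dst[y * stride + x] = (uint8_t)(dst[y * stride + x] + block[4 * y + x]);
    }

    /* Hypothetical exported entry point: the one place where the 32-bit
     * stride is sign-extended (the movsxdifnidn equivalent). */
    void idct4_add(uint8_t *dst, const int16_t *block, int stride)
    {
        add_rows(dst, block, (ptrdiff_t)stride);
    }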
Andreas Rheinhardt
18019f177e avcodec/x86/h264idct: Remove dead MMX macros
Forgotten in 4618f36a24.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
2025-11-30 00:15:43 +01:00
Kacper Michajłow
9cd4be6d7c tools/sofa2wavs: fix build on Windows
Signed-off-by: Kacper Michajłow <kasper93@gmail.com>
2025-11-29 21:43:12 +00:00
4 changed files with 134 additions and 190 deletions

libavcodec/x86/h264_idct.asm

@@ -51,11 +51,23 @@ scan8_mem: db 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
%endif
cextern pw_32
cextern pw_1
SECTION .text
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
; %1=callee, %2=dst to jump to if tail call is impossible (can be empty,
; then no jmp is performed), %3=current iteration, %4=last iteration
%macro TAIL_CALL_IF_LAST 4
%if (%3 == %4) && !has_epilogue
jmp %1
%else
call %1
%ifnempty %2
jmp %2
%endif
%endif
%endmacro
; %1=uint8_t *dst, %2=int16_t *block, %3=ptrdiff_t stride
%macro IDCT4_ADD 3
; Load dct coeffs
movq m0, [%2]
@@ -77,10 +89,15 @@ SECTION .text
paddw m0, m6
IDCT4_1D w, 0, 1, 2, 3, 4, 5
pxor m7, m7
movq [%2+ 0], m7
movq [%2+ 8], m7
movq [%2+16], m7
movq [%2+24], m7
%if mmsize == 16
mova [%2+ 0], m7
mova [%2+16], m7
%else
movq [%2+ 0], m7
movq [%2+ 8], m7
movq [%2+16], m7
movq [%2+24], m7
%endif
STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
lea %1, [%1+%3*2]
@@ -145,62 +162,7 @@ SECTION .text
IDCT8_1D [%1], [%1+ 64]
%endmacro
; %1=int16_t *block, %2=int16_t *dstblock
%macro IDCT8_ADD_MMX_START 2
IDCT8_1D_FULL %1
mova [%1], m7
TRANSPOSE4x4W 0, 1, 2, 3, 7
mova m7, [%1]
mova [%2 ], m0
mova [%2+16], m1
mova [%2+32], m2
mova [%2+48], m3
TRANSPOSE4x4W 4, 5, 6, 7, 3
mova [%2+ 8], m4
mova [%2+24], m5
mova [%2+40], m6
mova [%2+56], m7
%endmacro
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_MMX_END 3-4
IDCT8_1D_FULL %2
mova [%2 ], m5
mova [%2+16], m6
mova [%2+32], m7
pxor m7, m7
%if %0 == 4
movq [%4+ 0], m7
movq [%4+ 8], m7
movq [%4+ 16], m7
movq [%4+ 24], m7
movq [%4+ 32], m7
movq [%4+ 40], m7
movq [%4+ 48], m7
movq [%4+ 56], m7
movq [%4+ 64], m7
movq [%4+ 72], m7
movq [%4+ 80], m7
movq [%4+ 88], m7
movq [%4+ 96], m7
movq [%4+104], m7
movq [%4+112], m7
movq [%4+120], m7
%endif
STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
lea %1, [%1+%3*2]
STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
mova m0, [%2 ]
mova m1, [%2+16]
mova m2, [%2+32]
lea %1, [%1+%3*2]
STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
lea %1, [%1+%3*2]
STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
%endmacro
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
; %1=uint8_t *dst, %2=int16_t *block, %3=ptrdiff_t stride
%macro IDCT8_ADD_SSE 4
IDCT8_1D_FULL %2
%if ARCH_X86_64
@@ -371,30 +333,6 @@ INIT_XMM cpuname
RET
INIT_MMX mmx
h264_idct_add8_mmx_plane:
movsxdifnidn r3, r3d
.nextblock:
movzx r6, byte [scan8+r5]
movzx r6, byte [r4+r6]
or r6w, word [r2]
test r6, r6
jz .skipblock
%if ARCH_X86_64
mov r0d, dword [r1+r5*4]
add r0, [dst2q]
%else
mov r0, r1m ; XXX r1m here is actually r0m of the calling func
mov r0, [r0]
add r0, dword [r1+r5*4]
%endif
IDCT4_ADD r0, r2, r3
.skipblock:
inc r5
add r2, 32
test r5, 3
jnz .nextblock
rep ret
cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
; dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
movsxdifnidn r3, r3d
@@ -423,13 +361,34 @@ cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, str
call h264_idct_add8_mmx_plane
add r5, 4
call h264_idct_add8_mmx_plane
TAIL_CALL h264_idct_add8_mmx_plane, 0
RET ; TODO: check rep ret after a function call
h264_idct_add8_mmx_plane:
.nextblock:
movzx r6d, byte [scan8+r5]
movzx r6d, byte [r4+r6]
or r6w, word [r2]
test r6d, r6d
jz .skipblock
%if ARCH_X86_64
mov r0d, dword [r1+r5*4]
add r0, [dst2q]
%else
mov r0, r1m ; XXX r1m here is actually r0m of the calling func
mov r0, [r0]
add r0, dword [r1+r5*4]
%endif
IDCT4_ADD r0, r2, r3
.skipblock:
inc r5d
add r2, 32
test r5d, 3
jnz .nextblock
rep ret
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = ptrdiff_t stride, r6=clobbered
h264_idct_dc_add8_mmxext:
movsxdifnidn r3, r3d
movd m0, [r2 ] ; 0 0 X D
mov word [r2+ 0], 0
punpcklwd m0, [r2+32] ; x X d D
@@ -448,9 +407,8 @@ h264_idct_dc_add8_mmxext:
ALIGN 16
INIT_XMM sse2
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = ptrdiff_t stride
h264_add8x4_idct_sse2:
movsxdifnidn r3, r3d
movq m0, [r2+ 0]
movq m1, [r2+ 8]
movq m2, [r2+16]
@@ -483,7 +441,7 @@ h264_add8x4_idct_sse2:
%else
add r0, r0m
%endif
call h264_add8x4_idct_sse2
TAIL_CALL_IF_LAST h264_add8x4_idct_sse2, , %1, 7
.cycle%1end:
%if %1 < 7
add r2, 64
@@ -520,8 +478,7 @@ RET
%else
add r0, r0m
%endif
call h264_add8x4_idct_sse2
jmp .cycle%1end
TAIL_CALL_IF_LAST h264_add8x4_idct_sse2, .cycle%1end, %1, 7
.try%1dc:
movsx r0, word [r2 ]
or r0w, word [r2+32]
@@ -532,7 +489,7 @@ RET
%else
add r0, r0m
%endif
call h264_idct_dc_add8_mmxext
TAIL_CALL_IF_LAST h264_idct_dc_add8_mmxext, , %1, 7
.cycle%1end:
%if %1 < 7
add r2, 64
@@ -569,8 +526,7 @@ RET
mov r0, [r0]
add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
call h264_add8x4_idct_sse2
jmp .cycle%1end
TAIL_CALL_IF_LAST h264_add8x4_idct_sse2, .cycle%1end, %1, 3
.try%1dc:
movsx r0, word [r2 ]
or r0w, word [r2+32]
@@ -583,7 +539,7 @@ RET
mov r0, [r0]
add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
call h264_idct_dc_add8_mmxext
TAIL_CALL_IF_LAST h264_idct_dc_add8_mmxext, , %1, 3
.cycle%1end:
%if %1 == 1
add r2, 384+64
@@ -612,7 +568,7 @@ cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
add8_sse2_cycle 3, 0x64
RET
;void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul)
;void ff_h264_luma_dc_dequant_idct_sse2(int16_t *output, int16_t *input, int qmul)
%macro WALSH4_1D 5
SUMSUB_BADC w, %4, %3, %2, %1, %5
@@ -620,111 +576,92 @@ RET
SWAP %1, %4, %3
%endmacro
%macro DEQUANT 1-3
%if cpuflag(sse2)
movd xmm4, t3d
movq xmm5, [pw_1]
pshufd xmm4, xmm4, 0
movq2dq xmm0, m0
movq2dq xmm1, m1
movq2dq xmm2, m2
movq2dq xmm3, m3
punpcklwd xmm0, xmm5
punpcklwd xmm1, xmm5
punpcklwd xmm2, xmm5
punpcklwd xmm3, xmm5
pmaddwd xmm0, xmm4
pmaddwd xmm1, xmm4
pmaddwd xmm2, xmm4
pmaddwd xmm3, xmm4
psrad xmm0, %1
psrad xmm1, %1
psrad xmm2, %1
psrad xmm3, %1
packssdw xmm0, xmm1
packssdw xmm2, xmm3
%else
mova m7, [pw_1]
mova m4, %1
punpcklwd %1, m7
punpckhwd m4, m7
mova m5, %2
punpcklwd %2, m7
punpckhwd m5, m7
movd m7, t3d
punpckldq m7, m7
pmaddwd %1, m7
pmaddwd %2, m7
pmaddwd m4, m7
pmaddwd m5, m7
psrad %1, %3
psrad %2, %3
psrad m4, %3
psrad m5, %3
packssdw %1, m4
packssdw %2, m5
%endif
; requires m5 to contain pw_1
%macro DEQUANT 1
movd m4, t3d
pshufd m4, m4, 0
punpcklwd m0, m5
punpcklwd m1, m5
punpcklwd m2, m5
punpcklwd m3, m5
pmaddwd m0, m4
pmaddwd m1, m4
pmaddwd m2, m4
pmaddwd m3, m4
psrad m0, %1
psrad m1, %1
psrad m2, %1
psrad m3, %1
%endmacro
%macro STORE_WORDS 5-9
%if cpuflag(sse)
movd t0d, %1
psrldq %1, 4
movd t1d, %1
psrldq %1, 4
mov [t2+%2*32], t0w
mov [t2+%4*32], t1w
shr t0d, 16
shr t1d, 16
%macro STORE_WORDS 10
%if ARCH_X86_64
movq t0, %1
movq t1, %2
psrldq %1, 8
psrldq %2, 8
mov [t2+%3*32], t0w
mov [t2+%5*32], t1w
movd t0d, %1
psrldq %1, 4
movd t1d, %1
mov [t2+%6*32], t0w
mov [t2+%7*32], t1w
shr t0, 32
shr t1, 32
mov [t2+%4*32], t0w
mov [t2+%8*32], t1w
shr t0d, 16
shr t1d, 16
mov [t2+%7*32], t0w
movq t0, %1
movq t1, %2
mov [t2+%5*32], t0w
mov [t2+%9*32], t1w
shr t0, 32
shr t1, 32
mov [t2+%6*32], t0w
mov [t2+%10*32], t1w
%else
movd t0d, %1
psrlq %1, 32
movd t1d, %1
mov [t2+%2*32], t0w
mov [t2+%4*32], t1w
shr t0d, 16
shr t1d, 16
movd t0d, %1
movd t1d, %2
psrldq %1, 4
psrldq %2, 4
mov [t2+%3*32], t0w
mov [t2+%5*32], t1w
mov [t2+%7*32], t1w
movd t0d, %1
movd t1d, %2
psrldq %1, 4
psrldq %2, 4
mov [t2+%4*32], t0w
mov [t2+%8*32], t1w
movd t0d, %1
movd t1d, %2
psrldq %1, 4
psrldq %2, 4
mov [t2+%5*32], t0w
mov [t2+%9*32], t1w
movd t0d, %1
movd t1d, %2
mov [t2+%6*32], t0w
mov [t2+%10*32], t1w
%endif
%endmacro
%macro DEQUANT_STORE 1
%if cpuflag(sse2)
DEQUANT %1
STORE_WORDS xmm0, 0, 1, 4, 5, 2, 3, 6, 7
STORE_WORDS xmm2, 8, 9, 12, 13, 10, 11, 14, 15
%else
DEQUANT m0, m1, %1
STORE_WORDS m0, 0, 1, 4, 5
STORE_WORDS m1, 2, 3, 6, 7
DEQUANT m2, m3, %1
STORE_WORDS m2, 8, 9, 12, 13
STORE_WORDS m3, 10, 11, 14, 15
%endif
STORE_WORDS m0, m1, 0, 1, 4, 5, 2, 3, 6, 7
STORE_WORDS m2, m3, 8, 9, 12, 13, 10, 11, 14, 15
%endmacro
INIT_XMM sse2
cglobal h264_luma_dc_dequant_idct, 3, 4, 7
INIT_MMX cpuname
movq m3, [r1+24]
movq m2, [r1+16]
movq m1, [r1+ 8]
movq m0, [r1+ 0]
WALSH4_1D 0,1,2,3,4
TRANSPOSE4x4W 0,1,2,3,4
punpcklwd m0, m1
punpcklwd m2, m3
mova m4, m0
pcmpeqw m5, m5
punpckldq m0, m2
punpckhdq m4, m2
movhlps m1, m0
movhlps m3, m4
SWAP 2, 4
WALSH4_1D 0,1,2,3,4
; shift, tmp, output, qmul
@@ -737,6 +674,7 @@ INIT_MMX cpuname
%else
DECLARE_REG_TMP 1,3,0,2
%endif
psrlw m5, 15
cmp t3d, 32767
jg .big_qmul
@@ -752,8 +690,8 @@ INIT_MMX cpuname
inc t1d
shr t3d, t0b
sub t1d, t0d
movd xmm6, t1d
DEQUANT_STORE xmm6
movd m6, t1d
DEQUANT_STORE m6
RET
%ifdef __NASM_VER__

libavutil/x86/x86inc.asm

@@ -609,7 +609,7 @@ DECLARE_REG 14, R13, 120
RESET_STACK_STATE
%endmacro
%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs
%define has_epilogue (regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs)
%macro RET 0
WIN64_RESTORE_XMM_INTERNAL
@@ -658,7 +658,7 @@ DECLARE_REG 14, R13, 72
%endif
%endmacro
%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
%define has_epilogue (regs_used > 9 || stack_size > 0 || vzeroupper_required)
%macro RET 0
%if stack_size_padded > 0
@@ -722,7 +722,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%endif
%endmacro
%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required
%define has_epilogue (regs_used > 3 || stack_size > 0 || vzeroupper_required)
%macro RET 0
%if stack_size_padded > 0

tests/checkasm/h264dsp.c

@@ -336,7 +336,7 @@ static void check_idct_dequant(void)
LOCAL_ALIGNED_16(int32_t, dst1_32, [16 * 16]);
H264DSPContext h;
int bit_depth, i, qmul;
declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_SSE2, void, int16_t *output, int16_t *input, int qmul);
declare_func(void, int16_t *output, int16_t *input, int qmul);
qmul = rnd() % 4096;

tools/sofa2wavs.c

@@ -24,6 +24,12 @@
#include <stdio.h>
#include <mysofa.h>
#ifdef _WIN32
#include <direct.h>
#undef mkdir
#define mkdir(a, b) _mkdir(a)
#endif
int main(int argc, char **argv)
{
struct MYSOFA_HRTF *hrtf;
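
For context, a standalone sketch of the shim added above (hypothetical
directory name, not the tool's actual code): it lets the POSIX-style
mkdir(path, mode) call compile on Windows, where _mkdir() takes no mode
argument.

    #include <stdio.h>

    #ifdef _WIN32
    #include <direct.h>
    #define mkdir(path, mode) _mkdir(path)   /* mode is ignored on Windows */
    #else
    #include <sys/stat.h>
    #include <sys/types.h>
    #endif

    int main(void)
    {
        if (mkdir("output_wavs", 0755) != 0)  /* hypothetical directory name */
            perror("mkdir");
        return 0;
    }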