avfilter/vf_fsppdsp: Use restrict

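This is possible because the requirements of restrict are
fulfilled (the pointer arguments of these functions do not
alias one another); it is also beneficial both performance-
and code-size-wise.
For GCC 14 (with -O3), this reduced the code size by 26750B
here; for Clang 20, the reduction was 432B.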

Old benchmarks:
mul_thrmat_c:                                            4.3 ( 1.00x)
mul_thrmat_sse2:                                         4.3 ( 1.00x)
store_slice_c:                                        2810.8 ( 1.00x)
store_slice_sse2:                                      542.5 ( 5.18x)
store_slice2_c:                                       3817.0 ( 1.00x)
store_slice2_sse2:                                     410.4 ( 9.30x)

New benchmarks:
mul_thrmat_c:                                            4.3 ( 1.00x)
mul_thrmat_sse2:                                         4.3 ( 1.00x)
store_slice_c:                                        1510.1 ( 1.00x)
store_slice_sse2:                                      545.2 ( 2.77x)
store_slice2_c:                                       1763.5 ( 1.00x)
store_slice2_sse2:                                     408.3 ( 4.32x)

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2025-11-10 23:03:23 +01:00
parent ff85a20b7d
commit d19050a1ae
2 changed files with 26 additions and 20 deletions

View File

@@ -64,7 +64,7 @@ DECLARE_ALIGNED(8, const uint8_t, ff_fspp_dither)[8][8] = {
};
//This func reads from 1 slice, 1 and clears 0 & 1
void ff_store_slice_c(uint8_t *dst, int16_t *src,
void ff_store_slice_c(uint8_t *restrict dst, int16_t *restrict src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
{
@@ -93,7 +93,7 @@ void ff_store_slice_c(uint8_t *dst, int16_t *src,
}
//This func reads from 2 slices, 0 & 2 and clears 2-nd
void ff_store_slice2_c(uint8_t *dst, int16_t *src,
void ff_store_slice2_c(uint8_t *restrict dst, int16_t *restrict src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
{
@@ -121,13 +121,14 @@ void ff_store_slice2_c(uint8_t *dst, int16_t *src,
}
}
void ff_mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q)
void ff_mul_thrmat_c(int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr, int q)
{
for (int a = 0; a < 64; a++)
thr_adr[a] = q * thr_adr_noq[a];
}
void ff_column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt)
void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t *restrict data,
int16_t *restrict output, int cnt)
{
int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int_simd16_t tmp10, tmp11, tmp12, tmp13;
@@ -249,7 +250,8 @@ void ff_column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt
}
}
void ff_row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt)
void ff_row_idct_c(int16_t *restrict workspace, int16_t *restrict output_adr,
ptrdiff_t output_stride, int cnt)
{
int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int_simd16_t tmp10, tmp11, tmp12, tmp13;
@@ -311,7 +313,8 @@ void ff_row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_str
}
}
void ff_row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt)
void ff_row_fdct_c(int16_t *restrict data, const uint8_t *restrict pixels,
ptrdiff_t line_size, int cnt)
{
int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int_simd16_t tmp10, tmp11, tmp12, tmp13;

View File

@@ -31,40 +31,43 @@
#include "libavutil/attributes_internal.h"
typedef struct FSPPDSPContext {
void (*store_slice)(uint8_t *dst, int16_t *src /* align 16 */,
void (*store_slice)(uint8_t *restrict dst, int16_t *restrict src /* align 16 */,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
void (*store_slice2)(uint8_t *dst, int16_t *src /* align 16 */,
void (*store_slice2)(uint8_t *restrict dst, int16_t *restrict src /* align 16 */,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
void (*mul_thrmat)(int16_t *thr_adr_noq /* align 16 */,
int16_t *thr_adr /* align 16 */, int q);
void (*mul_thrmat)(int16_t *restrict thr_adr_noq /* align 16 */,
int16_t *restrict thr_adr /* align 16 */, int q);
void (*column_fidct)(int16_t *thr_adr, int16_t *data,
int16_t *output, int cnt);
void (*column_fidct)(int16_t *restrict thr_adr, int16_t *data,
int16_t *restrict output, int cnt);
void (*row_idct)(int16_t *workspace, int16_t *output_adr,
void (*row_idct)(int16_t *restrict workspace, int16_t *restrict output_adr,
ptrdiff_t output_stride, int cnt);
void (*row_fdct)(int16_t *data, const uint8_t *pixels,
void (*row_fdct)(int16_t *restrict data, const uint8_t *restrict pixels,
ptrdiff_t line_size, int cnt);
} FSPPDSPContext;
FF_VISIBILITY_PUSH_HIDDEN
extern const uint8_t ff_fspp_dither[8][8];
void ff_store_slice_c(uint8_t *dst, int16_t *src,
void ff_store_slice_c(uint8_t *restrict dst, int16_t *restrict src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
void ff_store_slice2_c(uint8_t *dst, int16_t *src,
void ff_store_slice2_c(uint8_t *restrict dst, int16_t *restrict src,
ptrdiff_t dst_stride, ptrdiff_t src_stride,
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
void ff_mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
void ff_column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
void ff_row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
void ff_row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
void ff_mul_thrmat_c(int16_t *restrict thr_adr_noq, int16_t *restrict thr_adr, int q);
void ff_column_fidct_c(int16_t *restrict thr_adr, int16_t *restrict data,
int16_t *restrict output, int cnt);
void ff_row_idct_c(int16_t *restrict workspace, int16_t *restrict output_adr,
ptrdiff_t output_stride, int cnt);
void ff_row_fdct_c(int16_t *restrict data, const uint8_t *restrict pixels,
ptrdiff_t line_size, int cnt);
void ff_fsppdsp_init_x86(FSPPDSPContext *fspp);
FF_VISIBILITY_POP_HIDDEN