Files
ffmpeg/libswscale/aarch64/asm-offsets.h
Arpad Panyik 1f30ff30fb swscale: Add AArch64 Neon path for xyz12Torgb48 LE
Add optimized Neon code path for the little endian case of the
xyz12Torgb48 function. The innermost loop processes the data in 4x2
pixel blocks using software gathers with the matrix multiplication
and clipping done by Neon.

Relative runtime of micro benchmarks after this patch on some
Cortex and Neoverse CPU cores:

 xyz12le_rgb48le    X1      X3      X4    X925      V2
 16x4_neon:       2.55x   4.34x   3.84x   3.31x   3.22x
 32x4_neon:       2.39x   3.63x   3.22x   3.35x   3.29x
 64x4_neon:       2.37x   3.31x   2.91x   3.33x   3.27x
 128x4_neon:      2.34x   3.28x   2.91x   3.35x   3.24x
 256x4_neon:      2.30x   3.17x   2.91x   3.32x   3.10x
 512x4_neon:      2.26x   3.10x   2.91x   3.30x   3.07x
 1024x4_neon:     2.26x   3.07x   2.96x   3.30x   3.05x
 1920x4_neon:     2.26x   3.06x   2.93x   3.28x   3.04x

 xyz12le_rgb48le   A76     A78    A715    A720    A725
 16x4_neon:       2.33x   2.28x   2.53x   3.33x   3.19x
 32x4_neon:       2.35x   2.18x   2.45x   3.23x   3.24x
 64x4_neon:       2.35x   2.16x   2.42x   3.15x   3.21x
 128x4_neon:      2.35x   2.13x   2.39x   3.00x   3.09x
 256x4_neon:      2.36x   2.12x   2.35x   2.85x   2.99x
 512x4_neon:      2.35x   2.14x   2.35x   2.78x   2.95x
 1024x4_neon:     2.31x   2.09x   2.33x   2.80x   2.91x
 1920x4_neon:     2.30x   2.07x   2.32x   2.81x   2.94x

 xyz12le_rgb48le   A55    A510    A520
 16x4_neon:       2.09x   1.92x   2.36x
 32x4_neon:       2.05x   1.89x   2.38x
 64x4_neon:       2.02x   1.77x   2.35x
 128x4_neon:      1.96x   1.74x   2.25x
 256x4_neon:      1.90x   1.72x   2.19x
 512x4_neon:      1.83x   1.75x   2.16x
 1024x4_neon:     1.83x   1.62x   2.15x
 1920x4_neon:     1.82x   1.60x   2.15x

Signed-off-by: Arpad Panyik <Arpad.Panyik@arm.com>
2025-12-05 10:28:18 +00:00

37 lines
1.2 KiB
C

/*
* Copyright (c) 2025 Arpad Panyik <Arpad.Panyik@arm.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef SWSCALE_AARCH64_ASM_OFFSETS_H
#define SWSCALE_AARCH64_ASM_OFFSETS_H
/* SwsLuts */
#define SL_IN 0x00
#define SL_OUT 0x08
/* SwsColorXform */
#define SCX_GAMMA 0x00
#define SCX_MAT 0x10
#define SCX_GAMMA_IN (SCX_GAMMA + SL_IN)
#define SCX_GAMMA_OUT (SCX_GAMMA + SL_OUT)
#define SCX_MAT_00 SCX_MAT
#define SCX_MAT_22 (SCX_MAT + 8 * 2)
#endif /* SWSCALE_AARCH64_ASM_OFFSETS_H */