diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
index 1de8c9c0d6..1c82e34e28 100644
--- a/libswscale/aarch64/Makefile
+++ b/libswscale/aarch64/Makefile
@@ -8,4 +8,5 @@ NEON-OBJS += aarch64/hscale.o \
                aarch64/range_convert_neon.o    \
                aarch64/rgb2rgb_neon.o          \
                aarch64/swscale_unscaled_neon.o \
+               aarch64/xyz2rgb_neon.o          \
                aarch64/yuv2rgb_neon.o          \
diff --git a/libswscale/aarch64/asm-offsets.h b/libswscale/aarch64/asm-offsets.h
new file mode 100644
index 0000000000..110389c965
--- /dev/null
+++ b/libswscale/aarch64/asm-offsets.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2025 Arpad Panyik
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef SWSCALE_AARCH64_ASM_OFFSETS_H
+#define SWSCALE_AARCH64_ASM_OFFSETS_H
+
+/* SwsLuts */
+#define SL_IN            0x00
+#define SL_OUT           0x08
+
+/* SwsColorXform */
+#define SCX_GAMMA        0x00
+#define SCX_MAT          0x10
+#define SCX_GAMMA_IN     (SCX_GAMMA + SL_IN)
+#define SCX_GAMMA_OUT    (SCX_GAMMA + SL_OUT)
+#define SCX_MAT_00       SCX_MAT
+#define SCX_MAT_22       (SCX_MAT + 8 * 2)
+
+#endif /* SWSCALE_AARCH64_ASM_OFFSETS_H */
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index 55fff03a5a..4f86364f4b 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -21,6 +21,35 @@
 #include "libswscale/swscale.h"
 #include "libswscale/swscale_internal.h"
 #include "libavutil/aarch64/cpu.h"
+#include "asm-offsets.h"
+
+#define SIZEOF_MEMBER(type, member) \
+    sizeof(((type*)0)->member)
+
+static_assert(offsetof(SwsLuts, in)  == SL_IN,  "struct layout mismatch");
+static_assert(offsetof(SwsLuts, out) == SL_OUT, "struct layout mismatch");
+
+static_assert(offsetof(SwsColorXform, gamma) == SCX_GAMMA,
+              "struct layout mismatch");
+static_assert(offsetof(SwsColorXform, mat) == SCX_MAT,
+              "struct layout mismatch");
+
+static_assert(offsetof(SwsColorXform, mat) +
+              2 * SIZEOF_MEMBER(SwsColorXform, mat[0]) +
+              2 * SIZEOF_MEMBER(SwsColorXform, mat[0][0]) == SCX_MAT_22,
+              "struct layout mismatch");
+
+void ff_xyz12Torgb48le_neon_asm(const SwsColorXform *c, uint8_t *dst,
+                                int dst_stride, const uint8_t *src,
+                                int src_stride, int w, int h);
+
+static void xyz12Torgb48le_neon(const SwsInternal *c, uint8_t *dst,
+                                int dst_stride, const uint8_t *src,
+                                int src_stride, int w, int h)
+{
+    ff_xyz12Torgb48le_neon_asm(&c->xyz2rgb, dst, dst_stride, src, src_stride,
+                               w, h);
+}
 
 void ff_hscale16to15_4_neon_asm(int shift, int16_t *_dst, int dstW,
                                 const uint8_t *_src, const int16_t *filter,
@@ -307,6 +336,17 @@ av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c)
     }
 }
 
+av_cold void ff_sws_init_xyzdsp_aarch64(SwsInternal *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+        if (!isBE(c->opts.src_format)) {
+            c->xyz12Torgb48 = xyz12Torgb48le_neon;
+        }
+    }
+}
+
 av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
 {
     int cpu_flags = av_get_cpu_flags();
diff --git a/libswscale/aarch64/xyz2rgb_neon.S b/libswscale/aarch64/xyz2rgb_neon.S
new file mode 100644
index 0000000000..4b2135085a
--- /dev/null
+++ b/libswscale/aarch64/xyz2rgb_neon.S
@@ -0,0 +1,702 @@
+/*
+ * Copyright (c) 2025 Arpad Panyik
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "asm-offsets.h"
+
+#define JUMP_ALIGN 2
+#define LOOP_ALIGN 2
+
+function ff_xyz12Torgb48le_neon_asm, export=1
+// x0  const SwsColorXform *c
+// x1  uint8_t *dst
+// w2  int dst_stride
+// x3  const uint8_t *src
+// w4  int src_stride
+// w5  int w
+// w6  int h
+
+        ldp             x7, x8, [x0, #(SCX_GAMMA_IN)]  // gamma.in, gamma.out
+        ldr             q6, [x0, #(SCX_MAT_00)]        // mat[0][0]..[2][1]
+        ldr             h7, [x0, #(SCX_MAT_22)]        // mat[2][2]; > 0
+
+        add             w9, w5, w5, lsl #1             // w * 3
+        add             x17, x3, w4, sxtw              // sr2 = src + src_stride
+        add             x16, x1, w2, sxtw              // ds2 = dst + dst_stride
+        sub             w4, w4, w9                     // src_stride - w * 3
+        sub             w2, w2, w9                     // dst_stride - w * 3
+        abs             v6.8h, v6.8h                   // abs(mat[0][0]..[2][1])
+        sbfiz           x4, x4, #1, #32                // src_stride * 2 - w * 6
+        sbfiz           x2, x2, #1, #32                // dst_stride * 2 - w * 6
+
+        subs            w6, w6, #2
+        b.lt            6f                             // h < 2
+
+        stp             x19, x20, [sp, #-64]!
+        stp             x21, x22, [sp, #16]
+        stp             x23, x24, [sp, #32]
+        str             x25, [sp, #48]
+
+        .align LOOP_ALIGN
+1:      // yp loop for 2x4 pixels
+        subs            w0, w5, #4
+        b.lt            3f                             // w < 4
+
+        .align LOOP_ALIGN
+2:      // xp loop for 2x4 pixels: XYZ0[0..3], XYZ1[0..3]
+        ldp             x9, x10, [x3]                  // x9 = X0[0] Y0[0] Z0[0] X0[1], x10 = Y0[1] Z0[1] X0[2] Y0[2]
+        ldr             x11, [x3, #16]                 // x11 = Z0[2] X0[3] Y0[3] Z0[3]
+        add             x3, x3, #24
+        ubfx            x12, x9, #4, #12               // X0[0] >> 4
+        lsr             x13, x9, #52                   // X0[1] >> 4
+        ubfx            x14, x10, #36, #12             // X0[2] >> 4
+        ubfx            x15, x11, #20, #12             // X0[3] >> 4
+
+        ldp             x19, x20, [x17]                // x19 = X1[0] Y1[0] Z1[0] X1[1], x20 = Y1[1] Z1[1] X1[2] Y1[2]
+        ldr             x21, [x17, #16]                // x21 = Z1[2] X1[3] Y1[3] Z1[3]
+        add             x17, x17, #24
+        ubfx            x22, x19, #4, #12              // X1[0] >> 4
+        lsr             x23, x19, #52                  // X1[1] >> 4
+        ubfx            x24, x20, #36, #12             // X1[2] >> 4
+        ubfx            x25, x21, #20, #12             // X1[3] >> 4
+
+        ldr             h0, [x7, x12, lsl #1]          // gamma.in[X0[0] >> 4]
+        ubfx            x12, x9, #20, #12              // Y0[0] >> 4
+        ldr             h16, [x7, x13, lsl #1]         // gamma.in[X0[1] >> 4]
+        ubfx            x13, x10, #4, #12              // Y0[1] >> 4
+        ldr             h17, [x7, x14, lsl #1]         // gamma.in[X0[2] >> 4]
+        lsr             x14, x10, #52                  // Y0[2] >> 4
+        ldr             h18, [x7, x15, lsl #1]         // gamma.in[X0[3] >> 4]
+        ubfx            x15, x11, #36, #12             // Y0[3] >> 4
+
+        ldr             h20, [x7, x22, lsl #1]         // gamma.in[X1[0] >> 4]
+        ubfx            x22, x19, #20, #12             // Y1[0] >> 4
+        ldr             h26, [x7, x23, lsl #1]         // gamma.in[X1[1] >> 4]
+        ubfx            x23, x20, #4, #12              // Y1[1] >> 4
+        ldr             h27, [x7, x24, lsl #1]         // gamma.in[X1[2] >> 4]
+        lsr             x24, x20, #52                  // Y1[2] >> 4
+        ldr             h28, [x7, x25, lsl #1]         // gamma.in[X1[3] >> 4]
+        ubfx            x25, x21, #36, #12             // Y1[3] >> 4
+
+        mov             v0.h[1], v16.h[0]              // v0.4h = gamma.in[X0[0..1] >> 4]
+        mov             v17.h[1], v18.h[0]             // v17.4h = gamma.in[X0[2..3] >> 4]
+        mov             v0.s[1], v17.s[0]              // v0.4h = gamma.in[X0[0..3] >> 4]
+        ldr             h1, [x7, x12, lsl #1]          // gamma.in[Y0[0] >> 4]
+        umull           v3.4s, v0.4h, v6.h[0]          // R0[0..3] = gamma.in[X0[0..3] >> 4] * mat[0][0]
+        umull           v5.4s, v0.4h, v6.h[6]          // B0[0..3] = gamma.in[X0[0..3] >> 4] * mat[2][0]
+        ubfx            x12, x9, #36, #12              // Z0[0] >> 4
+        ldr             h16, [x7, x13, lsl #1]         // gamma.in[Y0[1] >> 4]
+
+        mov             v20.h[1], v26.h[0]             // v20.4h = gamma.in[X1[0..1] >> 4]
+        mov             v27.h[1], v28.h[0]             // v27.4h = gamma.in[X1[2..3] >> 4]
+        mov             v20.s[1], v27.s[0]             // v20.4h = gamma.in[X1[0..3] >> 4]
+        ldr             h21, [x7, x22, lsl #1]         // gamma.in[Y1[0] >> 4]
+        umull           v23.4s, v20.4h, v6.h[0]        // R1[0..3] = gamma.in[X1[0..3] >> 4] * mat[0][0]
+        umull           v25.4s, v20.4h, v6.h[6]        // B1[0..3] = gamma.in[X1[0..3] >> 4] * mat[2][0]
+        ubfx            x22, x19, #36, #12             // Z1[0] >> 4
+        ldr             h26, [x7, x23, lsl #1]         // gamma.in[Y1[1] >> 4]
+
+        ubfx            x13, x10, #20, #12             // Z0[1] >> 4
+        ldr             h17, [x7, x14, lsl #1]         // gamma.in[Y0[2] >> 4]
+        ubfx            x14, x11, #4, #12              // Z0[2] >> 4
+        ldr             h18, [x7, x15, lsl #1]         // gamma.in[Y0[3] >> 4]
+        lsr             x15, x11, #52                  // Z0[3] >> 4
+        mov             v1.h[1], v16.h[0]              // v1.4h = gamma.in[Y0[0..1] >> 4]
+        mov             v17.h[1], v18.h[0]             // v17.4h = gamma.in[Y0[2..3] >> 4]
+        mov             v1.s[1], v17.s[0]              // v1.4h = gamma.in[Y0[0..3] >> 4]
+
+        ubfx            x23, x20, #20, #12             // Z1[1] >> 4
+        ldr             h27, [x7, x24, lsl #1]         // gamma.in[Y1[2] >> 4]
+        ubfx            x24, x21, #4, #12              // Z1[2] >> 4
+        ldr             h28, [x7, x25, lsl #1]         // gamma.in[Y1[3] >> 4]
+        umull           v4.4s, v1.4h, v6.h[4]          // G0[0..3] = gamma.in[Y0[0..3] >> 4] * mat[1][1]
+        umlsl           v3.4s, v1.4h, v6.h[1]          // R0[0..3] -= gamma.in[Y0[0..3] >> 4] * mat[0][1]
+
+        lsr             x25, x21, #52                  // Z1[3] >> 4
+        mov             v21.h[1], v26.h[0]             // v21.4h = gamma.in[Y1[0..1] >> 4]
+        mov             v27.h[1], v28.h[0]             // v27.4h = gamma.in[Y1[2..3] >> 4]
+        mov             v21.s[1], v27.s[0]             // v21.4h = gamma.in[Y1[0..3] >> 4]
+        umlsl           v4.4s, v0.4h, v6.h[3]          // G0[0..3] -= gamma.in[X0[0..3] >> 4] * mat[1][0]
+        umlsl           v5.4s, v1.4h, v6.h[7]          // B0[0..3] -= gamma.in[Y0[0..3] >> 4] * mat[2][1]
+
+        ldr             h2, [x7, x12, lsl #1]          // gamma.in[Z0[0] >> 4]
+        ldr             h16, [x7, x13, lsl #1]         // gamma.in[Z0[1] >> 4]
+        ldr             h17, [x7, x14, lsl #1]         // gamma.in[Z0[2] >> 4]
+        ldr             h18, [x7, x15, lsl #1]         // gamma.in[Z0[3] >> 4]
+        umull           v24.4s, v21.4h, v6.h[4]        // G1[0..3] = gamma.in[Y1[0..3] >> 4] * mat[1][1]
+        umlsl           v23.4s, v21.4h, v6.h[1]        // R1[0..3] -= gamma.in[Y1[0..3] >> 4] * mat[0][1]
+
+        mov             v2.h[1], v16.h[0]              // v2.4h = gamma.in[Z0[0..1] >> 4]
+        mov             v17.h[1], v18.h[0]             // v17.4h = gamma.in[Z0[2..3] >> 4]
+        mov             v2.s[1], v17.s[0]              // v2.4h = gamma.in[Z0[0..3] >> 4]
+        umlsl           v24.4s, v20.4h, v6.h[3]        // G1[0..3] -= gamma.in[X1[0..3] >> 4] * mat[1][0]
+        umlsl           v25.4s, v21.4h, v6.h[7]        // B1[0..3] -= gamma.in[Y1[0..3] >> 4] * mat[2][1]
+
+        ldr             h22, [x7, x22, lsl #1]         // gamma.in[Z1[0] >> 4]
+        ldr             h26, [x7, x23, lsl #1]         // gamma.in[Z1[1] >> 4]
+        ldr             h27, [x7, x24, lsl #1]         // gamma.in[Z1[2] >> 4]
+        ldr             h28, [x7, x25, lsl #1]         // gamma.in[Z1[3] >> 4]
+        mov             v22.h[1], v26.h[0]             // v22.4h = gamma.in[Z1[0..1] >> 4]
+        mov             v27.h[1], v28.h[0]             // v27.4h = gamma.in[Z1[2..3] >> 4]
+        mov             v22.s[1], v27.s[0]             // v22.4h = gamma.in[Z1[0..3] >> 4]
+
+        umlsl           v3.4s, v2.4h, v6.h[2]          // R0[0..3] -= gamma.in[Z0[0..3] >> 4] * mat[0][2]
+        sqshrun         v3.4h, v3.4s, #12              // clip(R0[0..3] >> 12)
+        umlal           v4.4s, v2.4h, v6.h[5]          // G0[0..3] += gamma.in[Z0[0..3] >> 4] * mat[1][2]
+        sqshrun         v4.4h, v4.4s, #12              // clip(G0[0..3] >> 12)
+        umov            w9, v3.h[0]                    // clip(R0[0] >> 12)
+        umov            w10, v4.h[1]                   // clip(G0[1] >> 12)
+        umlal           v5.4s, v2.4h, v7.h[0]          // B0[0..3] += gamma.in[Z0[0..3] >> 4] * mat[2][2]
+        sqshrun         v5.4h, v5.4s, #12              // clip(B0[0..3] >> 12)
+
+        umlsl           v23.4s, v22.4h, v6.h[2]        // R1[0..3] -= gamma.in[Z1[0..3] >> 4] * mat[0][2]
+        sqshrun         v23.4h, v23.4s, #12            // clip(R1[0..3] >> 12)
+        umlal           v24.4s, v22.4h, v6.h[5]        // G1[0..3] += gamma.in[Z1[0..3] >> 4] * mat[1][2]
+        sqshrun         v24.4h, v24.4s, #12            // clip(G1[0..3] >> 12)
+        umov            w19, v23.h[0]                  // clip(R1[0] >> 12)
+        umov            w20, v24.h[1]                  // clip(G1[1] >> 12)
+        umlal           v25.4s, v22.4h, v7.h[0]        // B1[0..3] += gamma.in[Z1[0..3] >> 4] * mat[2][2]
+        sqshrun         v25.4h, v25.4s, #12            // clip(B1[0..3] >> 12)
+
+        umov            w11, v5.h[2]                   // clip(B0[2] >> 12)
+        umov            w12, v4.h[0]                   // clip(G0[0] >> 12)
+        ldrh            w9, [x8, x9, lsl #1]           // R0[0] = gamma.out[clip(R0[0] >> 12)]
+        lsl             x9, x9, #4                     // R0[0] << 4
+        umov            w13, v5.h[1]                   // clip(B0[1] >> 12)
+        ldrh            w10, [x8, x10, lsl #1]         // G0[1] = gamma.out[clip(G0[1] >> 12)]
+        lsl             x10, x10, #4                   // G0[1] << 4
+
+        umov            w21, v25.h[2]                  // clip(B1[2] >> 12)
+        umov            w22, v24.h[0]                  // clip(G1[0] >> 12)
+        ldrh            w19, [x8, x19, lsl #1]         // R1[0] = gamma.out[clip(R1[0] >> 12)]
+        lsl             x19, x19, #4                   // R1[0] << 4
+        umov            w23, v25.h[1]                  // clip(B1[1] >> 12)
+        ldrh            w20, [x8, x20, lsl #1]         // G1[1] = gamma.out[clip(G1[1] >> 12)]
+        lsl             x20, x20, #4                   // G1[1] << 4
+
+        umov            w14, v3.h[3]                   // clip(R0[3] >> 12)
+        ldrh            w11, [x8, x11, lsl #1]         // B0[2] = gamma.out[clip(B0[2] >> 12)]
+        lsl             x11, x11, #4                   // B0[2] << 4
+        umov            w15, v5.h[0]                   // clip(B0[0] >> 12)
+        ldrh            w12, [x8, x12, lsl #1]         // G0[0] = gamma.out[clip(G0[0] >> 12)]
+        orr             x9, x9, x12, lsl #20           // R0[0] << 4, G0[0] << 4
+        umov            w12, v3.h[2]                   // clip(R0[2] >> 12)
+        ldrh            w13, [x8, x13, lsl #1]         // B0[1] = gamma.out[clip(B0[1] >> 12)]
+
+        umov            w24, v23.h[3]                  // clip(R1[3] >> 12)
+        ldrh            w21, [x8, x21, lsl #1]         // B1[2] = gamma.out[clip(B1[2] >> 12)]
+        lsl             x21, x21, #4                   // B1[2] << 4
+        umov            w25, v25.h[0]                  // clip(B1[0] >> 12)
+        ldrh            w22, [x8, x22, lsl #1]         // G1[0] = gamma.out[clip(G1[0] >> 12)]
+        orr             x19, x19, x22, lsl #20         // R1[0] << 4, G1[0] << 4
+        umov            w22, v23.h[2]                  // clip(R1[2] >> 12)
+        ldrh            w23, [x8, x23, lsl #1]         // B1[1] = gamma.out[clip(B1[1] >> 12)]
+
+        orr             x10, x10, x13, lsl #20         // G0[1] << 4, B0[1] << 4
+        umov            w13, v4.h[3]                   // clip(G0[3] >> 12)
+        ldrh            w14, [x8, x14, lsl #1]         // R0[3] = gamma.out[clip(R0[3] >> 12)]
+        orr             x11, x11, x14, lsl #20         // B0[2] << 4, R0[3] << 4
+        umov            w14, v3.h[1]                   // clip(R0[1] >> 12)
+        ldrh            w15, [x8, x15, lsl #1]         // B0[0] = gamma.out[clip(B0[0] >> 12)]
+        orr             x9, x9, x15, lsl #36           // R0[0] << 4, G0[0] << 4, B0[0] << 4
+        umov            w15, v4.h[2]                   // clip(G0[2] >> 12)
+
+        orr             x20, x20, x23, lsl #20         // G1[1] << 4, B1[1] << 4
+        umov            w23, v24.h[3]                  // clip(G1[3] >> 12)
+        ldrh            w24, [x8, x24, lsl #1]         // R1[3] = gamma.out[clip(R1[3] >> 12)]
+        orr             x21, x21, x24, lsl #20         // B1[2] << 4, R1[3] << 4
+        umov            w24, v23.h[1]                  // clip(R1[1] >> 12)
+        ldrh            w25, [x8, x25, lsl #1]         // B1[0] = gamma.out[clip(B1[0] >> 12)]
+        orr             x19, x19, x25, lsl #36         // R1[0] << 4, G1[0] << 4, B1[0] << 4
+        umov            w25, v24.h[2]                  // clip(G1[2] >> 12)
+
+        ldrh            w12, [x8, x12, lsl #1]         // R0[2] = gamma.out[clip(R0[2] >> 12)]
+        orr             x10, x10, x12, lsl #36         // G0[1] << 4, B0[1] << 4, R0[2] << 4
+        umov            w12, v5.h[3]                   // clip(B0[3] >> 12)
+        ldrh            w13, [x8, x13, lsl #1]         // G0[3] = gamma.out[clip(G0[3] >> 12)]
+        orr             x11, x11, x13, lsl #36         // B0[2] << 4, R0[3] << 4, G0[3] << 4
+        ldrh            w14, [x8, x14, lsl #1]         // R0[1] = gamma.out[clip(R0[1] >> 12)]
+        orr             x9, x9, x14, lsl #52           // x9 = R0[0] << 4, G0[0] << 4, B0[0] << 4, R0[1] << 4
+        ldrh            w15, [x8, x15, lsl #1]         // G0[2] = gamma.out[clip(G0[2] >> 12)]
+        orr             x10, x10, x15, lsl #52         // x10 = G0[1] << 4, B0[1] << 4, R0[2] << 4, G0[2] << 4
+        ldrh            w12, [x8, x12, lsl #1]         // B0[3] = gamma.out[clip(B0[3] >> 12)]
+        orr             x11, x11, x12, lsl #52         // x11 = B0[2] << 4, R0[3] << 4, G0[3] << 4, B0[3] << 4
+        stp             x9, x10, [x1]
+        str             x11, [x1, #16]
+
+        ldrh            w22, [x8, x22, lsl #1]         // R1[2] = gamma.out[clip(R1[2] >> 12)]
+        orr             x20, x20, x22, lsl #36         // G1[1] << 4, B1[1] << 4, R1[2] << 4
+        umov            w22, v25.h[3]                  // clip(B1[3] >> 12)
+        ldrh            w23, [x8, x23, lsl #1]         // G1[3] = gamma.out[clip(G1[3] >> 12)]
+        orr             x21, x21, x23, lsl #36         // B1[2] << 4, R1[3] << 4, G1[3] << 4
+        ldrh            w24, [x8, x24, lsl #1]         // R1[1] = gamma.out[clip(R1[1] >> 12)]
+        orr             x19, x19, x24, lsl #52         // x19 = R1[0] << 4, G1[0] << 4, B1[0] << 4, R1[1] << 4
+        ldrh            w25, [x8, x25, lsl #1]         // G1[2] = gamma.out[clip(G1[2] >> 12)]
+        orr             x20, x20, x25, lsl #52         // x20 = G1[1] << 4, B1[1] << 4, R1[2] << 4, G1[2] << 4
+        ldrh            w22, [x8, x22, lsl #1]         // B1[3] = gamma.out[clip(B1[3] >> 12)]
+        orr             x21, x21, x22, lsl #52         // x21 = B1[2] << 4, R1[3] << 4, G1[3] << 4, B1[3] << 4
+        stp             x19, x20, [x16]
+        str             x21, [x16, #16]
+
+        add             x1, x1, #24
+        add             x16, x16, #24
+
+        subs            w0, w0, #4
+        b.ge            2b
+
+        .align JUMP_ALIGN
+3:
+        tst             w5, #3
+        b.eq            5f                             // no residual pixels; (w & 3) == 0
+
+        ldr             w10, [x3]                      // w10 = X0[0] Y0[0]
+        ldrh            w11, [x3, #4]                  // w11 = Z0[0]
+        add             x3, x3, #6
+        ldr             w20, [x17]                     // w20 = X1[0] Y1[0]
+        ldrh            w21, [x17, #4]                 // w21 = Z1[0]
+        add             x17, x17, #6
+        ubfx            w9, w10, #4, #12               // X0[0] >> 4
+        ubfx            w10, w10, #20, #12             // Y0[0] >> 4
+        lsr             w11, w11, #4                   // Z0[0] >> 4
+        ldr             h0, [x7, x9, lsl #1]           // v0.4h = gamma.in[X0[0] >> 4]
+        ldr             h1, [x7, x10, lsl #1]          // v1.4h = gamma.in[Y0[0] >> 4]
+        ldr             h2, [x7, x11, lsl #1]          // v2.4h = gamma.in[Z0[0] >> 4]
+        ubfx            w19, w20, #4, #12              // X1[0] >> 4
+        ubfx            w20, w20, #20, #12             // Y1[0] >> 4
+        lsr             w21, w21, #4                   // Z1[0] >> 4
+        ldr             h20, [x7, x19, lsl #1]         // v20.4h = gamma.in[X1[0] >> 4]
+        ldr             h21, [x7, x20, lsl #1]         // v21.4h = gamma.in[Y1[0] >> 4]
+        ldr             h22, [x7, x21, lsl #1]         // v22.4h = gamma.in[Z1[0] >> 4]
+
+        cmp             w0, #-2
+        b.lt            4f                             // (w & 3) == 1
+
+        ldr             w10, [x3]                      // w10 = X0[1] Y0[1]
+        ldrh            w11, [x3, #4]                  // w11 = Z0[1]
+        add             x3, x3, #6
+        ldr             w20, [x17]                     // w20 = X1[1] Y1[1]
+        ldrh            w21, [x17, #4]                 // w21 = Z1[1]
+        add             x17, x17, #6
+        ubfx            w9, w10, #4, #12               // X0[1] >> 4
+        ubfx            w10, w10, #20, #12             // Y0[1] >> 4
+        lsr             w11, w11, #4                   // Z0[1] >> 4
+        ldr             h16, [x7, x9, lsl #1]          // gamma.in[X0[1] >> 4]
+        ldr             h17, [x7, x10, lsl #1]         // gamma.in[Y0[1] >> 4]
+        ldr             h18, [x7, x11, lsl #1]         // gamma.in[Z0[1] >> 4]
+        ubfx            w19, w20, #4, #12              // X1[1] >> 4
+        ubfx            w20, w20, #20, #12             // Y1[1] >> 4
+        lsr             w21, w21, #4                   // Z1[1] >> 4
+        ldr             h23, [x7, x19, lsl #1]         // gamma.in[X1[1] >> 4]
+        ldr             h24, [x7, x20, lsl #1]         // gamma.in[Y1[1] >> 4]
+        ldr             h25, [x7, x21, lsl #1]         // gamma.in[Z1[1] >> 4]
+        mov             v0.h[1], v16.h[0]              // v0.4h = gamma.in[X0[0..1] >> 4]
+        mov             v1.h[1], v17.h[0]              // v1.4h = gamma.in[Y0[0..1] >> 4]
+        mov             v2.h[1], v18.h[0]              // v2.4h = gamma.in[Z0[0..1] >> 4]
+        mov             v20.h[1], v23.h[0]             // v20.4h = gamma.in[X1[0..1] >> 4]
+        mov             v21.h[1], v24.h[0]             // v21.4h = gamma.in[Y1[0..1] >> 4]
+        mov             v22.h[1], v25.h[0]             // v22.4h = gamma.in[Z1[0..1] >> 4]
+
+        b.le            4f                             // (w & 3) == 2
+
+        ldr             w10, [x3]                      // w10 = X0[2] Y0[2]
+        ldrh            w11, [x3, #4]                  // w11 = Z0[2]
+        add             x3, x3, #6
+        ldr             w20, [x17]                     // w20 = X1[2] Y1[2]
+        ldrh            w21, [x17, #4]                 // w21 = Z1[2]
+        add             x17, x17, #6
+        ubfx            w9, w10, #4, #12               // X0[2] >> 4
+        ubfx            w10, w10, #20, #12             // Y0[2] >> 4
+        lsr             w11, w11, #4                   // Z0[2] >> 4
+        ldr             h16, [x7, x9, lsl #1]          // gamma.in[X0[2] >> 4]
+        ldr             h17, [x7, x10, lsl #1]         // gamma.in[Y0[2] >> 4]
+        ldr             h18, [x7, x11, lsl #1]         // gamma.in[Z0[2] >> 4]
+        ubfx            w19, w20, #4, #12              // X1[2] >> 4
+        ubfx            w20, w20, #20, #12             // Y1[2] >> 4
+        lsr             w21, w21, #4                   // Z1[2] >> 4
+        ldr             h23, [x7, x19, lsl #1]         // gamma.in[X1[2] >> 4]
+        ldr             h24, [x7, x20, lsl #1]         // gamma.in[Y1[2] >> 4]
+        ldr             h25, [x7, x21, lsl #1]         // gamma.in[Z1[2] >> 4]
+        mov             v0.h[2], v16.h[0]              // v0.4h = gamma.in[X0[0..2] >> 4]
+        mov             v1.h[2], v17.h[0]              // v1.4h = gamma.in[Y0[0..2] >> 4]
+        mov             v2.h[2], v18.h[0]              // v2.4h = gamma.in[Z0[0..2] >> 4]
+        mov             v20.h[2], v23.h[0]             // v20.4h = gamma.in[X1[0..2] >> 4]
+        mov             v21.h[2], v24.h[0]             // v21.4h = gamma.in[Y1[0..2] >> 4]
+        mov             v22.h[2], v25.h[0]             // v22.4h = gamma.in[Z1[0..2] >> 4]
+
+        .align JUMP_ALIGN
+4:
+        umull           v3.4s, v0.4h, v6.h[0]          // R0[0..2] = gamma.in[X0[0..2] >> 4] * mat[0][0]
+        umull           v5.4s, v0.4h, v6.h[6]          // B0[0..2] = gamma.in[X0[0..2] >> 4] * mat[2][0]
+
+        umull           v23.4s, v20.4h, v6.h[0]        // R1[0..2] = gamma.in[X1[0..2] >> 4] * mat[0][0]
+        umull           v25.4s, v20.4h, v6.h[6]        // B1[0..2] = gamma.in[X1[0..2] >> 4] * mat[2][0]
+
+        umull           v4.4s, v1.4h, v6.h[4]          // G0[0..2] = gamma.in[Y0[0..2] >> 4] * mat[1][1]
+        umlsl           v3.4s, v1.4h, v6.h[1]          // R0[0..2] -= gamma.in[Y0[0..2] >> 4] * mat[0][1]
+        umlsl           v4.4s, v0.4h, v6.h[3]          // G0[0..2] -= gamma.in[X0[0..2] >> 4] * mat[1][0]
+        umlsl           v5.4s, v1.4h, v6.h[7]          // B0[0..2] -= gamma.in[Y0[0..2] >> 4] * mat[2][1]
+
+        umull           v24.4s, v21.4h, v6.h[4]        // G1[0..2] = gamma.in[Y1[0..2] >> 4] * mat[1][1]
+        umlsl           v23.4s, v21.4h, v6.h[1]        // R1[0..2] -= gamma.in[Y1[0..2] >> 4] * mat[0][1]
+        umlsl           v24.4s, v20.4h, v6.h[3]        // G1[0..2] -= gamma.in[X1[0..2] >> 4] * mat[1][0]
+        umlsl           v25.4s, v21.4h, v6.h[7]        // B1[0..2] -= gamma.in[Y1[0..2] >> 4] * mat[2][1]
+
+        umlsl           v3.4s, v2.4h, v6.h[2]          // R0[0..2] -= gamma.in[Z0[0..2] >> 4] * mat[0][2]
+        sqshrun         v3.4h, v3.4s, #12              // clip(R0[0..2] >> 12)
+        umlal           v4.4s, v2.4h, v6.h[5]          // G0[0..2] += gamma.in[Z0[0..2] >> 4] * mat[1][2]
+        sqshrun         v4.4h, v4.4s, #12              // clip(G0[0..2] >> 12)
+        umlal           v5.4s, v2.4h, v7.h[0]          // B0[0..2] += gamma.in[Z0[0..2] >> 4] * mat[2][2]
+        sqshrun         v5.4h, v5.4s, #12              // clip(B0[0..2] >> 12)
+
+        umlsl           v23.4s, v22.4h, v6.h[2]        // R1[0..2] -= gamma.in[Z1[0..2] >> 4] * mat[0][2]
+        sqshrun         v23.4h, v23.4s, #12            // clip(R1[0..2] >> 12)
+        umlal           v24.4s, v22.4h, v6.h[5]        // G1[0..2] += gamma.in[Z1[0..2] >> 4] * mat[1][2]
+        sqshrun         v24.4h, v24.4s, #12            // clip(G1[0..2] >> 12)
+        umlal           v25.4s, v22.4h, v7.h[0]        // B1[0..2] += gamma.in[Z1[0..2] >> 4] * mat[2][2]
+        sqshrun         v25.4h, v25.4s, #12            // clip(B1[0..2] >> 12)
+
+        umov            w9, v3.h[0]                    // clip(R0[0] >> 12)
+        umov            w10, v4.h[0]                   // clip(G0[0] >> 12)
+        umov            w11, v5.h[0]                   // clip(B0[0] >> 12)
+        ldrh            w9, [x8, x9, lsl #1]           // R0[0] = gamma.out[clip(R0[0] >> 12)]
+        ldrh            w10, [x8, x10, lsl #1]         // G0[0] = gamma.out[clip(G0[0] >> 12)]
+        ldrh            w11, [x8, x11, lsl #1]         // B0[0] = gamma.out[clip(B0[0] >> 12)]
+        umov            w19, v23.h[0]                  // clip(R1[0] >> 12)
+        umov            w20, v24.h[0]                  // clip(G1[0] >> 12)
+        umov            w21, v25.h[0]                  // clip(B1[0] >> 12)
+        ldrh            w19, [x8, x19, lsl #1]         // R1[0] = gamma.out[clip(R1[0] >> 12)]
+        ldrh            w20, [x8, x20, lsl #1]         // G1[0] = gamma.out[clip(G1[0] >> 12)]
+        ldrh            w21, [x8, x21, lsl #1]         // B1[0] = gamma.out[clip(B1[0] >> 12)]
+        lsl             w9, w9, #4                     // w9 = R0[0] << 4
+        lsl             w10, w10, #4                   // w10 = G0[0] << 4
+        lsl             w11, w11, #4                   // w11 = B0[0] << 4
+        strh            w9, [x1]
+        strh            w10, [x1, #2]
+        strh            w11, [x1, #4]
+        lsl             w19, w19, #4                   // w19 = R1[0] << 4
+        lsl             w20, w20, #4                   // w20 = G1[0] << 4
+        lsl             w21, w21, #4                   // w21 = B1[0] << 4
+        strh            w19, [x16]
+        strh            w20, [x16, #2]
+        strh            w21, [x16, #4]
+        add             x1, x1, #6
+        add             x16, x16, #6
+
+        cmp             w0, #-2
+        b.lt            5f                             // (w & 3) == 1
+
+        umov            w9, v3.h[1]                    // clip(R0[1] >> 12)
+        umov            w10, v4.h[1]                   // clip(G0[1] >> 12)
+        umov            w11, v5.h[1]                   // clip(B0[1] >> 12)
+        ldrh            w9, [x8, x9, lsl #1]           // R0[1] = gamma.out[clip(R0[1] >> 12)]
+        ldrh            w10, [x8, x10, lsl #1]         // G0[1] = gamma.out[clip(G0[1] >> 12)]
+        ldrh            w11, [x8, x11, lsl #1]         // B0[1] = gamma.out[clip(B0[1] >> 12)]
+        umov            w19, v23.h[1]                  // clip(R1[1] >> 12)
+        umov            w20, v24.h[1]                  // clip(G1[1] >> 12)
+        umov            w21, v25.h[1]                  // clip(B1[1] >> 12)
+        ldrh            w19, [x8, x19, lsl #1]         // R1[1] = gamma.out[clip(R1[1] >> 12)]
+        ldrh            w20, [x8, x20, lsl #1]         // G1[1] = gamma.out[clip(G1[1] >> 12)]
+        ldrh            w21, [x8, x21, lsl #1]         // B1[1] = gamma.out[clip(B1[1] >> 12)]
+        lsl             w9, w9, #4                     // w9 = R0[1] << 4
+        lsl             w10, w10, #4                   // w10 = G0[1] << 4
+        lsl             w11, w11, #4                   // w11 = B0[1] << 4
+        strh            w9, [x1]
+        strh            w10, [x1, #2]
+        strh            w11, [x1, #4]
+        lsl             w19, w19, #4                   // w19 = R1[1] << 4
+        lsl             w20, w20, #4                   // w20 = G1[1] << 4
+        lsl             w21, w21, #4                   // w21 = B1[1] << 4
+        strh            w19, [x16]
+        strh            w20, [x16, #2]
+        strh            w21, [x16, #4]
+        add             x1, x1, #6
+        add             x16, x16, #6
+
+        b.le            5f                             // (w & 3) == 2
+
+        umov            w9, v3.h[2]                    // clip(R0[2] >> 12)
+        umov            w10, v4.h[2]                   // clip(G0[2] >> 12)
+        umov            w11, v5.h[2]                   // clip(B0[2] >> 12)
+        ldrh            w9, [x8, x9, lsl #1]           // R0[2] = gamma.out[clip(R0[2] >> 12)]
+        ldrh            w10, [x8, x10, lsl #1]         // G0[2] = gamma.out[clip(G0[2] >> 12)]
+        ldrh            w11, [x8, x11, lsl #1]         // B0[2] = gamma.out[clip(B0[2] >> 12)]
+        umov            w19, v23.h[2]                  // clip(R1[2] >> 12)
+        umov            w20, v24.h[2]                  // clip(G1[2] >> 12)
+        umov            w21, v25.h[2]                  // clip(B1[2] >> 12)
+        ldrh            w19, [x8, x19, lsl #1]         // R1[2] = gamma.out[clip(R1[2] >> 12)]
+        ldrh            w20, [x8, x20, lsl #1]         // G1[2] = gamma.out[clip(G1[2] >> 12)]
+        ldrh            w21, [x8, x21, lsl #1]         // B1[2] = gamma.out[clip(B1[2] >> 12)]
+        lsl             w9, w9, #4                     // w9 = R0[2] << 4
+        lsl             w10, w10, #4                   // w10 = G0[2] << 4
+        lsl             w11, w11, #4                   // w11 = B0[2] << 4
+        strh            w9, [x1]
+        strh            w10, [x1, #2]
+        strh            w11, [x1, #4]
+        lsl             w19, w19, #4                   // w19 = R1[2] << 4
+        lsl             w20, w20, #4                   // w20 = G1[2] << 4
+        lsl             w21, w21, #4                   // w21 = B1[2] << 4
+        strh            w19, [x16]
+        strh            w20, [x16, #2]
+        strh            w21, [x16, #4]
+        add             x1, x1, #6
+        add             x16, x16, #6
+
+        .align JUMP_ALIGN
+5:
+        add             x3, x3, x4
+        add             x17, x17, x4
+        add             x1, x1, x2
+        add             x16, x16, x2
+
+        subs            w6, w6, #2
+        b.ge            1b
+
+        ldp             x21, x22, [sp, #16]
+        ldp             x23, x24, [sp, #32]
+        ldr             x25, [sp, #48]
+        ldp             x19, x20, [sp], #64
+
+        .align JUMP_ALIGN
+6:
+        tbz             w6, #0, 10f                    // even number of lines; (h & 1) == 0
+
+        subs            w0, w5, #4
+        b.lt            8f                             // w < 4
+
+        .align LOOP_ALIGN
+7:      // loop for last odd line by 4 pixels: XYZ[0..3]
+        ldp             x9, x10, [x3]                  // x9 = X[0] Y[0] Z[0] X[1], x10 = Y[1] Z[1] X[2] Y[2]
+        ldr             x11, [x3, #16]                 // x11 = Z[2] X[3] Y[3] Z[3]
+        add             x3, x3, #24
+
+        ubfx            x12, x9, #4, #12               // X[0] >> 4
+        lsr             x13, x9, #52                   // X[1] >> 4
+        ubfx            x14, x10, #36, #12             // X[2] >> 4
+        ubfx            x15, x11, #20, #12             // X[3] >> 4
+
+        ldr             h0, [x7, x12, lsl #1]          // gamma.in[X[0] >> 4]
+        ubfx            x12, x9, #20, #12              // Y[0] >> 4
+        ldr             h16, [x7, x13, lsl #1]         // gamma.in[X[1] >> 4]
+        ubfx            x13, x10, #4, #12              // Y[1] >> 4
+        ldr             h17, [x7, x14, lsl #1]         // gamma.in[X[2] >> 4]
+        lsr             x14, x10, #52                  // Y[2] >> 4
+        ldr             h18, [x7, x15, lsl #1]         // gamma.in[X[3] >> 4]
+        ubfx            x15, x11, #36, #12             // Y[3] >> 4
+        mov             v0.h[1], v16.h[0]              // v0.4h = gamma.in[X[0..1] >> 4]
+        mov             v17.h[1], v18.h[0]             // v17.4h = gamma.in[X[2..3] >> 4]
+        mov             v0.s[1], v17.s[0]              // v0.4h = gamma.in[X[0..3] >> 4]
+
+        umull           v3.4s, v0.4h, v6.h[0]          // R[0..3] = gamma.in[X[0..3] >> 4] * mat[0][0]
+        umull           v5.4s, v0.4h, v6.h[6]          // B[0..3] = gamma.in[X[0..3] >> 4] * mat[2][0]
+
+        ldr             h1, [x7, x12, lsl #1]          // gamma.in[Y[0] >> 4]
+        ubfx            x12, x9, #36, #12              // Z[0] >> 4
+        ldr             h16, [x7, x13, lsl #1]         // gamma.in[Y[1] >> 4]
+        ubfx            x13, x10, #20, #12             // Z[1] >> 4
+        ldr             h17, [x7, x14, lsl #1]         // gamma.in[Y[2] >> 4]
+        ubfx            x14, x11, #4, #12              // Z[2] >> 4
+        ldr             h18, [x7, x15, lsl #1]         // gamma.in[Y[3] >> 4]
+        lsr             x15, x11, #52                  // Z[3] >> 4
+        mov             v1.h[1], v16.h[0]              // v1.4h = gamma.in[Y[0..1] >> 4]
+        mov             v17.h[1], v18.h[0]             // v17.4h = gamma.in[Y[2..3] >> 4]
+        mov             v1.s[1], v17.s[0]              // v1.4h = gamma.in[Y[0..3] >> 4]
+
+        umull           v4.4s, v1.4h, v6.h[4]          // G[0..3] = gamma.in[Y[0..3] >> 4] * mat[1][1]
+        umlsl           v3.4s, v1.4h, v6.h[1]          // R[0..3] -= gamma.in[Y[0..3] >> 4] * mat[0][1]
+        umlsl           v4.4s, v0.4h, v6.h[3]          // G[0..3] -= gamma.in[X[0..3] >> 4] * mat[1][0]
+        umlsl           v5.4s, v1.4h, v6.h[7]          // B[0..3] -= gamma.in[Y[0..3] >> 4] * mat[2][1]
+
+        ldr             h2, [x7, x12, lsl #1]          // gamma.in[Z[0] >> 4]
+        ldr             h16, [x7, x13, lsl #1]         // gamma.in[Z[1] >> 4]
+        ldr             h17, [x7, x14, lsl #1]         // gamma.in[Z[2] >> 4]
+        ldr             h18, [x7, x15, lsl #1]         // gamma.in[Z[3] >> 4]
+        mov             v2.h[1], v16.h[0]              // v2.4h = gamma.in[Z[0..1] >> 4]
+        mov             v17.h[1], v18.h[0]             // v17.4h = gamma.in[Z[2..3] >> 4]
+        mov             v2.s[1], v17.s[0]              // v2.4h = gamma.in[Z[0..3] >> 4]
+
+        umlsl           v3.4s, v2.4h, v6.h[2]          // R[0..3] -= gamma.in[Z[0..3] >> 4] * mat[0][2]
+        sqshrun         v3.4h, v3.4s, #12              // clip(R[0..3] >> 12)
+        umlal           v4.4s, v2.4h, v6.h[5]          // G[0..3] += gamma.in[Z[0..3] >> 4] * mat[1][2]
+        sqshrun         v4.4h, v4.4s, #12              // clip(G[0..3] >> 12)
+        umlal           v5.4s, v2.4h, v7.h[0]          // B[0..3] += gamma.in[Z[0..3] >> 4] * mat[2][2]
+        sqshrun         v5.4h, v5.4s, #12              // clip(B[0..3] >> 12)
+
+        umov            w9, v3.h[0]                    // clip(R[0] >> 12)
+        umov            w10, v4.h[1]                   // clip(G[1] >> 12)
+        umov            w11, v5.h[2]                   // clip(B[2] >> 12)
+
+        umov            w12, v4.h[0]                   // clip(G[0] >> 12)
+        ldrh            w9, [x8, x9, lsl #1]           // R[0] = gamma.out[clip(R[0] >> 12)]
+        lsl             x9, x9, #4                     // R[0] << 4
+        umov            w13, v5.h[1]                   // clip(B[1] >> 12)
+        ldrh            w10, [x8, x10, lsl #1]         // G[1] = gamma.out[clip(G[1] >> 12)]
+        lsl             x10, x10, #4                   // G[1] << 4
+        umov            w14, v3.h[3]                   // clip(R[3] >> 12)
+        ldrh            w11, [x8, x11, lsl #1]         // B[2] = gamma.out[clip(B[2] >> 12)]
+        lsl             x11, x11, #4                   // B[2] << 4
+
+        umov            w15, v5.h[0]                   // clip(B[0] >> 12)
+        ldrh            w12, [x8, x12, lsl #1]         // G[0] = gamma.out[clip(G[0] >> 12)]
+        orr             x9, x9, x12, lsl #20           // R[0] << 4, G[0] << 4
+        umov            w12, v3.h[2]                   // clip(R[2] >> 12)
+        ldrh            w13, [x8, x13, lsl #1]         // B[1] = gamma.out[clip(B[1] >> 12)]
+        orr             x10, x10, x13, lsl #20         // G[1] << 4, B[1] << 4
+        umov            w13, v4.h[3]                   // clip(G[3] >> 12)
+        ldrh            w14, [x8, x14, lsl #1]         // R[3] = gamma.out[clip(R[3] >> 12)]
+        orr             x11, x11, x14, lsl #20         // B[2] << 4, R[3] << 4
+
+        umov            w14, v3.h[1]                   // clip(R[1] >> 12)
+        ldrh            w15, [x8, x15, lsl #1]         // B[0] = gamma.out[clip(B[0] >> 12)]
+        orr             x9, x9, x15, lsl #36           // R[0] << 4, G[0] << 4, B[0] << 4
+        umov            w15, v4.h[2]                   // clip(G[2] >> 12)
+        ldrh            w12, [x8, x12, lsl #1]         // R[2] = gamma.out[clip(R[2] >> 12)]
+        orr             x10, x10, x12, lsl #36         // G[1] << 4, B[1] << 4, R[2] << 4
+        umov            w12, v5.h[3]                   // clip(B[3] >> 12)
+        ldrh            w13, [x8, x13, lsl #1]         // G[3] = gamma.out[clip(G[3] >> 12)]
+        orr             x11, x11, x13, lsl #36         // B[2] << 4, R[3] << 4, G[3] << 4
+
+        ldrh            w14, [x8, x14, lsl #1]         // R[1] = gamma.out[clip(R[1] >> 12)]
+        orr             x9, x9, x14, lsl #52           // x9 = R[0] << 4, G[0] << 4, B[0] << 4, R[1] << 4
+        ldrh            w15, [x8, x15, lsl #1]         // G[2] = gamma.out[clip(G[2] >> 12)]
+        orr             x10, x10, x15, lsl #52         // x10 = G[1] << 4, B[1] << 4, R[2] << 4, G[2] << 4
+        ldrh            w12, [x8, x12, lsl #1]         // B[3] = gamma.out[clip(B[3] >> 12)]
+        orr             x11, x11, x12, lsl #52         // x11 = B[2] << 4, R[3] << 4, G[3] << 4, B[3] << 4
+
+        stp             x9, x10, [x1]
+        str             x11, [x1, #16]
+        add             x1, x1, #24
+
+        subs            w0, w0, #4
+        b.ge            7b
+
+        .align JUMP_ALIGN
+8:
+        tst             w5, #3
+        b.eq            10f                            // no residual pixels; (w & 3) == 0
+
+        ldr             w10, [x3]                      // w10 = X[0] Y[0]
+        ldrh            w11, [x3, #4]                  // w11 = Z[0]
+        add             x3, x3, #6
+        ubfx            w9, w10, #4, #12               // X[0] >> 4
+        ubfx            w10, w10, #20, #12             // Y[0] >> 4
+        lsr             w11, w11, #4                   // Z[0] >> 4
+        ldr             h0, [x7, x9, lsl #1]           // v0.4h = gamma.in[X[0] >> 4]
+        ldr             h1, [x7, x10, lsl #1]          // v1.4h = gamma.in[Y[0] >> 4]
+        ldr             h2, [x7, x11, lsl #1]          // v2.4h = gamma.in[Z[0] >> 4]
+
+        cmp             w0, #-2
+        b.lt            9f                             // (w & 3) == 1
+
+        ldr             w10, [x3]                      // w10 = X[1] Y[1]
+        ldrh            w11, [x3, #4]                  // w11 = Z[1]
+        add             x3, x3, #6
+        ubfx            w9, w10, #4, #12               // X[1] >> 4
+        ubfx            w10, w10, #20, #12             // Y[1] >> 4
+        lsr             w11, w11, #4                   // Z[1] >> 4
+        ldr             h16, [x7, x9, lsl #1]          // gamma.in[X[1] >> 4]
+        ldr             h17, [x7, x10, lsl #1]         // gamma.in[Y[1] >> 4]
+        ldr             h18, [x7, x11, lsl #1]         // gamma.in[Z[1] >> 4]
+        mov             v0.h[1], v16.h[0]              // v0.4h = gamma.in[X[0..1] >> 4]
+        mov             v1.h[1], v17.h[0]              // v1.4h = gamma.in[Y[0..1] >> 4]
+        mov             v2.h[1], v18.h[0]              // v2.4h = gamma.in[Z[0..1] >> 4]
+
+        b.le            9f                             // (w & 3) == 2
+
+        ldr             w10, [x3]                      // w10 = X[2] Y[2]
+        ldrh            w11, [x3, #4]                  // w11 = Z[2]
+        add             x3, x3, #6
+        ubfx            w9, w10, #4, #12               // X[2] >> 4
+        ubfx            w10, w10, #20, #12             // Y[2] >> 4
+        lsr             w11, w11, #4                   // Z[2] >> 4
+        ldr             h16, [x7, x9, lsl #1]          // gamma.in[X[2] >> 4]
+        ldr             h17, [x7, x10, lsl #1]         // gamma.in[Y[2] >> 4]
+        ldr             h18, [x7, x11, lsl #1]         // gamma.in[Z[2] >> 4]
+        mov             v0.h[2], v16.h[0]              // v0.4h = gamma.in[X[0..2] >> 4]
+        mov             v1.h[2], v17.h[0]              // v1.4h = gamma.in[Y[0..2] >> 4]
+        mov             v2.h[2], v18.h[0]              // v2.4h = gamma.in[Z[0..2] >> 4]
+
+        .align JUMP_ALIGN
+9:
+        umull           v3.4s, v0.4h, v6.h[0]          // R[0..2] = gamma.in[X[0..2] >> 4] * mat[0][0]
+        umull           v5.4s, v0.4h, v6.h[6]          // B[0..2] = gamma.in[X[0..2] >> 4] * mat[2][0]
+
+        umull           v4.4s, v1.4h, v6.h[4]          // G[0..2] = gamma.in[Y[0..2] >> 4] * mat[1][1]
+        umlsl           v3.4s, v1.4h, v6.h[1]          // R[0..2] -= gamma.in[Y[0..2] >> 4] * mat[0][1]
+        umlsl           v4.4s, v0.4h, v6.h[3]          // G[0..2] -= gamma.in[X[0..2] >> 4] * mat[1][0]
+        umlsl           v5.4s, v1.4h, v6.h[7]          // B[0..2] -= gamma.in[Y[0..2] >> 4] * mat[2][1]
+
+        umlsl           v3.4s, v2.4h, v6.h[2]          // R[0..2] -= gamma.in[Z[0..2] >> 4] * mat[0][2]
+        sqshrun         v3.4h, v3.4s, #12              // clip(R[0..2] >> 12)
+        umlal           v4.4s, v2.4h, v6.h[5]          // G[0..2] += gamma.in[Z[0..2] >> 4] * mat[1][2]
+        sqshrun         v4.4h, v4.4s, #12              // clip(G[0..2] >> 12)
+        umlal           v5.4s, v2.4h, v7.h[0]          // B[0..2] += gamma.in[Z[0..2] >> 4] * mat[2][2]
+        sqshrun         v5.4h, v5.4s, #12              // clip(B[0..2] >> 12)
+
+        umov            w9, v3.h[0]                    // clip(R[0] >> 12)
+        umov            w10, v4.h[0]                   // clip(G[0] >> 12)
+        umov            w11, v5.h[0]                   // clip(B[0] >> 12)
+        ldrh            w9, [x8, x9, lsl #1]           // R[0] = gamma.out[clip(R[0] >> 12)]
+        ldrh            w10, [x8, x10, lsl #1]         // G[0] = gamma.out[clip(G[0] >> 12)]
+        ldrh            w11, [x8, x11, lsl #1]         // B[0] = gamma.out[clip(B[0] >> 12)]
+        lsl             w9, w9, #4                     // w9 = R[0] << 4
+        lsl             w10, w10, #4                   // w10 = G[0] << 4
+        lsl             w11, w11, #4                   // w11 = B[0] << 4
+        strh            w9, [x1]
+        strh            w10, [x1, #2]
+        strh            w11, [x1, #4]
+        add             x1, x1, #6
+
+        cmp             w0, #-2
+        b.lt            10f                            // (w & 3) == 1
+
+        umov            w9, v3.h[1]                    // clip(R[1] >> 12)
+        umov            w10, v4.h[1]                   // clip(G[1] >> 12)
+        umov            w11, v5.h[1]                   // clip(B[1] >> 12)
+        ldrh            w9, [x8, x9, lsl #1]           // R[1] = gamma.out[clip(R[1] >> 12)]
+        ldrh            w10, [x8, x10, lsl #1]         // G[1] = gamma.out[clip(G[1] >> 12)]
+        ldrh            w11, [x8, x11, lsl #1]         // B[1] = gamma.out[clip(B[1] >> 12)]
+        lsl             w9, w9, #4                     // w9 = R[1] << 4
+        lsl             w10, w10, #4                   // w10 = G[1] << 4
+        lsl             w11, w11, #4                   // w11 = B[1] << 4
+        strh            w9, [x1]
+        strh            w10, [x1, #2]
+        strh            w11, [x1, #4]
+        add             x1, x1, #6
+
+        b.le            10f                            // (w & 3) == 2
+
+        umov            w9, v3.h[2]                    // clip(R[2] >> 12)
+        umov            w10, v4.h[2]                   // clip(G[2] >> 12)
+        umov            w11, v5.h[2]                   // clip(B[2] >> 12)
+        ldrh            w9, [x8, x9, lsl #1]           // R[2] = gamma.out[clip(R[2] >> 12)]
+        ldrh            w10, [x8, x10, lsl #1]         // G[2] = gamma.out[clip(G[2] >> 12)]
+        ldrh            w11, [x8, x11, lsl #1]         // B[2] = gamma.out[clip(B[2] >> 12)]
+        lsl             w9, w9, #4                     // w9 = R[2] << 4
+        lsl             w10, w10, #4                   // w10 = G[2] << 4
+        lsl             w11, w11, #4                   // w11 = B[2] << 4
+        strh            w9, [x1]
+        strh            w10, [x1, #2]
+        strh            w11, [x1, #4]
+        add             x1, x1, #6
+
+        .align JUMP_ALIGN
+10:
+        ret
+endfunc
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 95a61a4183..96df4ed3f4 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -861,6 +861,10 @@ av_cold void ff_sws_init_xyzdsp(SwsInternal *c)
 {
     c->xyz12Torgb48 = xyz12Torgb48_c;
     c->rgb48Toxyz12 = rgb48Toxyz12_c;
+
+#if ARCH_AARCH64
+    ff_sws_init_xyzdsp_aarch64(c);
+#endif
 }
 
 void ff_update_palette(SwsInternal *c, const uint32_t *pal)
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 02e625a10e..5c58272664 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -732,6 +732,8 @@ av_cold void ff_sws_init_range_convert_riscv(SwsInternal *c);
 av_cold void ff_sws_init_range_convert_x86(SwsInternal *c);
 
 av_cold void ff_sws_init_xyzdsp(SwsInternal *c);
+av_cold void ff_sws_init_xyzdsp_aarch64(SwsInternal *c);
+
 av_cold int ff_sws_fill_xyztables(SwsInternal *c);
 
 SwsFunc ff_yuv2rgb_init_x86(SwsInternal *c);
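Reviewer note (not part of the patch): the per-pixel transform the NEON kernel performs can be summarized by the scalar C sketch below, derived from the comments in xyz2rgb_neon.S. The function and helper names are invented for illustration; the LUT sizes (4096-entry gamma.in, 65536-entry gamma.out) and the uint16_t element types are assumptions about the SwsLuts/SwsColorXform definitions, which this patch does not show.

#include <stdint.h>

/* Saturate to [0, 65535], mirroring what sqshrun does after the 32-bit
 * accumulators are shifted right by 12. */
static inline uint16_t clip_u16(int64_t v)
{
    return v < 0 ? 0 : v > 65535 ? 65535 : (uint16_t)v;
}

/* One pixel of 12-bit XYZ (stored in the high bits of little-endian
 * 16-bit words, hence the >> 4) to 16-bit RGB. The matrix holds the
 * absolute values of the coefficients (cf. the abs on v6.8h); the fixed
 * sign pattern below matches the umull/umlsl/umlal sequence in the asm. */
static void xyz12_to_rgb48_pixel(const uint16_t *gamma_in,  /* assumed 4096 entries */
                                 const uint16_t *gamma_out, /* assumed 65536 entries */
                                 const uint16_t mat[3][3],  /* |coefficients| */
                                 const uint16_t xyz[3], uint16_t rgb[3])
{
    int64_t x = gamma_in[xyz[0] >> 4];
    int64_t y = gamma_in[xyz[1] >> 4];
    int64_t z = gamma_in[xyz[2] >> 4];

    int64_t r =  x * mat[0][0] - y * mat[0][1] - z * mat[0][2];
    int64_t g = -x * mat[1][0] + y * mat[1][1] + z * mat[1][2];
    int64_t b =  x * mat[2][0] - y * mat[2][1] + z * mat[2][2];

    /* arithmetic >> 12 assumed; gamma.out values must fit in 12 bits,
     * as the final << 4 into a 16-bit lane implies */
    rgb[0] = (uint16_t)(gamma_out[clip_u16(r >> 12)] << 4);
    rgb[1] = (uint16_t)(gamma_out[clip_u16(g >> 12)] << 4);
    rgb[2] = (uint16_t)(gamma_out[clip_u16(b >> 12)] << 4);
}

Hard-coding the sign pattern is what allows the kernel to run the whole matrix multiply on abs()'d coefficients with unsigned multiplies; it relies on mat[2][2] being positive, as the "mat[2][2]; > 0" comment in the asm asserts.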