ffmpeg/libavcodec/aarch64/vvc/inter.S
Georgii Zagoruiko f790de2a87 aarch64/vvc: Optimisations of put_luma_h() functions for 10/12-bit
RPi4 (auto-vectorisation is turned on)
put_luma_h_10_4x4_c:                                   282.8 ( 1.00x)
put_luma_h_10_8x8_c:                                  1069.5 ( 1.00x)
put_luma_h_10_8x8_neon:                                207.5 ( 5.15x)
put_luma_h_10_16x16_c:                                1999.6 ( 1.00x)
put_luma_h_10_16x16_neon:                              777.5 ( 2.57x)
put_luma_h_10_32x32_c:                                6612.9 ( 1.00x)
put_luma_h_10_32x32_neon:                             3201.6 ( 2.07x)
put_luma_h_10_64x64_c:                               25059.0 ( 1.00x)
put_luma_h_10_64x64_neon:                            13623.5 ( 1.84x)
put_luma_h_10_128x128_c:                             91310.1 ( 1.00x)
put_luma_h_10_128x128_neon:                          50358.3 ( 1.81x)
put_luma_h_12_4x4_c:                                   282.1 ( 1.00x)
put_luma_h_12_8x8_c:                                  1068.4 ( 1.00x)
put_luma_h_12_8x8_neon:                                207.7 ( 5.14x)
put_luma_h_12_16x16_c:                                1998.0 ( 1.00x)
put_luma_h_12_16x16_neon:                              777.5 ( 2.57x)
put_luma_h_12_32x32_c:                                6612.0 ( 1.00x)
put_luma_h_12_32x32_neon:                             3201.6 ( 2.07x)
put_luma_h_12_64x64_c:                               25036.8 ( 1.00x)
put_luma_h_12_64x64_neon:                            13595.1 ( 1.84x)
put_luma_h_12_128x128_c:                             91305.8 ( 1.00x)
put_luma_h_12_128x128_neon:                          50359.7 ( 1.81x)

Apple M2 Air (auto-vectorisation is turned on)
put_luma_h_10_4x4_c:                                     0.3 ( 1.00x)
put_luma_h_10_8x8_c:                                     1.0 ( 1.00x)
put_luma_h_10_8x8_neon:                                  0.4 ( 2.59x)
put_luma_h_10_16x16_c:                                   2.9 ( 1.00x)
put_luma_h_10_16x16_neon:                                1.4 ( 2.01x)
put_luma_h_10_32x32_c:                                   9.4 ( 1.00x)
put_luma_h_10_32x32_neon:                                5.8 ( 1.62x)
put_luma_h_10_64x64_c:                                  35.6 ( 1.00x)
put_luma_h_10_64x64_neon:                               23.6 ( 1.51x)
put_luma_h_10_128x128_c:                               131.1 ( 1.00x)
put_luma_h_10_128x128_neon:                             92.6 ( 1.42x)
put_luma_h_12_4x4_c:                                     0.3 ( 1.00x)
put_luma_h_12_8x8_c:                                     1.0 ( 1.00x)
put_luma_h_12_8x8_neon:                                  0.4 ( 2.58x)
put_luma_h_12_16x16_c:                                   2.9 ( 1.00x)
put_luma_h_12_16x16_neon:                                1.4 ( 2.00x)
put_luma_h_12_32x32_c:                                   9.4 ( 1.00x)
put_luma_h_12_32x32_neon:                                5.8 ( 1.61x)
put_luma_h_12_64x64_c:                                  35.3 ( 1.00x)
put_luma_h_12_64x64_neon:                               23.3 ( 1.52x)
put_luma_h_12_128x128_c:                               131.2 ( 1.00x)
put_luma_h_12_128x128_neon:                             92.4 ( 1.42x)
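
The numbers are checkasm benchmark timings (lower is better) with the speed-up
over the C reference in parentheses; presumably gathered with checkasm's
benchmark mode, e.g. something like:

    make checkasm && ./tests/checkasm/checkasm --bench=put_luma_h

(the exact bench pattern and flags depend on the checkout).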
/*
* Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
#define VVC_MAX_PB_SIZE 128
#define BDOF_BLOCK_SIZE 16
#define BDOF_MIN_BLOCK_SIZE 4
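/*
 * Explicit weighted bi-prediction average. A hedged C sketch of the
 * per-pixel operation performed by the macro and function below
 * (illustrative names; weights, offset and shift are unpacked from
 * x6/x7 in the function):
 *
 *     int32_t sum = offset + src0[x] * weight0 + src1[x] * weight1;
 *     dst[x]      = av_clip(sum >> shift, 0, (1 << bit_depth) - 1);
 *
 * The right shift is done with sqshl by the negated shift amount, and
 * the lower clip bound comes for free from the unsigned saturating
 * narrow (sqxtun).
 */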
.macro vvc_w_avg bit_depth
.macro vvc_w_avg_\bit_depth\()_2_4 tap
.if \tap == 2
ldr s0, [src0]
ldr s2, [src1]
.else
ldr d0, [src0]
ldr d2, [src1]
.endif
mov v4.16b, v16.16b
smlal v4.4s, v0.4h, v19.4h
smlal v4.4s, v2.4h, v20.4h
sqshl v4.4s, v4.4s, v22.4s
sqxtun v4.4h, v4.4s
.if \bit_depth == 8
sqxtun v4.8b, v4.8h
.if \tap == 2
str h4, [dst]
.else // tap == 4
str s4, [dst]
.endif
.else // bit_depth > 8
umin v4.4h, v4.4h, v17.4h
.if \tap == 2
str s4, [dst]
.else
str d4, [dst]
.endif
.endif
add src0, src0, x10
add src1, src1, x10
add dst, dst, dst_stride
.endm
function ff_vvc_w_avg_\bit_depth\()_neon, export=1
dst .req x0
dst_stride .req x1
src0 .req x2
src1 .req x3
width .req w4
height .req w5
mov x10, #(VVC_MAX_PB_SIZE * 2)
cmp width, #8
lsr x11, x6, #32 // weight0
mov w12, w6 // weight1
lsr x13, x7, #32 // offset
mov w14, w7 // shift
dup v19.8h, w11
neg w14, w14 // so we can use sqshl
dup v20.8h, w12
dup v16.4s, w13
dup v22.4s, w14
.if \bit_depth >= 10
// clip pixel
mov w6, #((1 << \bit_depth) - 1)
dup v17.8h, w6
.endif
b.eq 8f
b.hi 16f
cmp width, #4
b.eq 4f
2: // width == 2
subs height, height, #1
vvc_w_avg_\bit_depth\()_2_4 2
b.ne 2b
b 32f
4: // width == 4
subs height, height, #1
vvc_w_avg_\bit_depth\()_2_4 4
b.ne 4b
b 32f
8: // width == 8
ld1 {v0.8h}, [src0], x10
ld1 {v2.8h}, [src1], x10
mov v4.16b, v16.16b
mov v5.16b, v16.16b
smlal v4.4s, v0.4h, v19.4h
smlal v4.4s, v2.4h, v20.4h
smlal2 v5.4s, v0.8h, v19.8h
smlal2 v5.4s, v2.8h, v20.8h
sqshl v4.4s, v4.4s, v22.4s
sqshl v5.4s, v5.4s, v22.4s
sqxtun v4.4h, v4.4s
sqxtun2 v4.8h, v5.4s
subs height, height, #1
.if \bit_depth == 8
sqxtun v4.8b, v4.8h
st1 {v4.8b}, [dst], dst_stride
.else
umin v4.8h, v4.8h, v17.8h
st1 {v4.8h}, [dst], dst_stride
.endif
b.ne 8b
b 32f
16: // width >= 16
mov w6, width
mov x7, src0
mov x8, src1
mov x9, dst
17:
ldp q0, q1, [x7], #32
ldp q2, q3, [x8], #32
mov v4.16b, v16.16b
mov v5.16b, v16.16b
mov v6.16b, v16.16b
mov v7.16b, v16.16b
smlal v4.4s, v0.4h, v19.4h
smlal v4.4s, v2.4h, v20.4h
smlal2 v5.4s, v0.8h, v19.8h
smlal2 v5.4s, v2.8h, v20.8h
smlal v6.4s, v1.4h, v19.4h
smlal v6.4s, v3.4h, v20.4h
smlal2 v7.4s, v1.8h, v19.8h
smlal2 v7.4s, v3.8h, v20.8h
sqshl v4.4s, v4.4s, v22.4s
sqshl v5.4s, v5.4s, v22.4s
sqshl v6.4s, v6.4s, v22.4s
sqshl v7.4s, v7.4s, v22.4s
sqxtun v4.4h, v4.4s
sqxtun v6.4h, v6.4s
sqxtun2 v4.8h, v5.4s
sqxtun2 v6.8h, v7.4s
subs w6, w6, #16
.if \bit_depth == 8
sqxtun v4.8b, v4.8h
sqxtun2 v4.16b, v6.8h
str q4, [x9], #16
.else
umin v4.8h, v4.8h, v17.8h
umin v6.8h, v6.8h, v17.8h
stp q4, q6, [x9], #32
.endif
b.ne 17b
subs height, height, #1
add src0, src0, x10
add src1, src1, x10
add dst, dst, dst_stride
b.ne 16b
32:
ret
.unreq dst
.unreq dst_stride
.unreq src0
.unreq src1
.unreq width
.unreq height
endfunc
.endm
vvc_w_avg 8
vvc_w_avg 10
vvc_w_avg 12
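/*
 * Plain bi-prediction average. Conceptually (a hedged sketch, with
 * shift = 15 - bit_depth):
 *
 *     dst[x] = av_clip((src0[x] + src1[x] + (1 << (shift - 1))) >> shift,
 *                      0, (1 << bit_depth) - 1);
 *
 * The code below stays in 16 bits by splitting this into a signed
 * halving add (shadd) followed by a rounding shift by shift - 1.
 */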
.macro vvc_avg bit_depth
function ff_vvc_avg_\bit_depth\()_neon, export=1
mov x10, #(VVC_MAX_PB_SIZE * 2)
movi v16.8h, #0
movi v17.16b, #255
ushr v17.8h, v17.8h, #(16 - \bit_depth)
cmp w4, #8
b.gt 16f
b.eq 8f
cmp w4, #4
b.eq 4f
2: // width == 2
ldr s0, [x2]
subs w5, w5, #1
ldr s1, [x3]
.if \bit_depth == 8
shadd v0.4h, v0.4h, v1.4h
sqrshrun v0.8b, v0.8h, #(15 - 1 - \bit_depth)
str h0, [x0]
.else
shadd v0.4h, v0.4h, v1.4h
srshr v0.4h, v0.4h, #(15 - 1 - \bit_depth)
smax v0.4h, v0.4h, v16.4h
smin v0.4h, v0.4h, v17.4h
str s0, [x0]
.endif
add x2, x2, #(VVC_MAX_PB_SIZE * 2)
add x3, x3, #(VVC_MAX_PB_SIZE * 2)
add x0, x0, x1
b.ne 2b
ret
4: // width == 4
ldr d0, [x2]
subs w5, w5, #1
ldr d1, [x3]
.if \bit_depth == 8
shadd v0.4h, v0.4h, v1.4h
sqrshrun v0.8b, v0.8h, #(15 - 1 - \bit_depth)
str s0, [x0]
.else
shadd v0.4h, v0.4h, v1.4h
srshr v0.4h, v0.4h, #(15 - 1 - \bit_depth)
smax v0.4h, v0.4h, v16.4h
smin v0.4h, v0.4h, v17.4h
str d0, [x0]
.endif
add x2, x2, #(VVC_MAX_PB_SIZE * 2)
add x3, x3, #(VVC_MAX_PB_SIZE * 2)
add x0, x0, x1
b.ne 4b
ret
8: // width == 8
ldr q0, [x2]
subs w5, w5, #1
ldr q1, [x3]
.if \bit_depth == 8
shadd v0.8h, v0.8h, v1.8h
sqrshrun v0.8b, v0.8h, #(15 - 1 - \bit_depth)
str d0, [x0]
.else
shadd v0.8h, v0.8h, v1.8h
srshr v0.8h, v0.8h, #(15 - 1 - \bit_depth)
smax v0.8h, v0.8h, v16.8h
smin v0.8h, v0.8h, v17.8h
str q0, [x0]
.endif
add x2, x2, #(VVC_MAX_PB_SIZE * 2)
add x3, x3, #(VVC_MAX_PB_SIZE * 2)
add x0, x0, x1
b.ne 8b
ret
16: // width >= 16
.if \bit_depth == 8
sub x1, x1, w4, sxtw
.else
sub x1, x1, w4, sxtw #1
.endif
sub x10, x10, w4, sxtw #1
3:
mov w6, w4 // width
1:
ldp q0, q1, [x2], #32
subs w6, w6, #16
ldp q2, q3, [x3], #32
.if \bit_depth == 8
shadd v4.8h, v0.8h, v2.8h
shadd v5.8h, v1.8h, v3.8h
sqrshrun v0.8b, v4.8h, #6
sqrshrun2 v0.16b, v5.8h, #6
st1 {v0.16b}, [x0], #16
.else
shadd v4.8h, v0.8h, v2.8h
shadd v5.8h, v1.8h, v3.8h
srshr v0.8h, v4.8h, #(15 - 1 - \bit_depth)
srshr v1.8h, v5.8h, #(15 - 1 - \bit_depth)
smax v0.8h, v0.8h, v16.8h
smax v1.8h, v1.8h, v16.8h
smin v0.8h, v0.8h, v17.8h
smin v1.8h, v1.8h, v17.8h
stp q0, q1, [x0], #32
.endif
b.ne 1b
subs w5, w5, #1
add x2, x2, x10
add x3, x3, x10
add x0, x0, x1
b.ne 3b
ret
endfunc
.endm
vvc_avg 8
vvc_avg 10
vvc_avg 12
/* x0: int16_t *dst
* x1: const uint8_t *_src
* x2: ptrdiff_t _src_stride
* w3: int height
* x4: intptr_t mx
* x5: intptr_t my
* w6: int width
*/
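/*
 * Copies the block into the 16-bit, VVC_MAX_PB_SIZE-strided DMVR scratch
 * buffer at the 10-bit DMVR intermediate precision. A hedged sketch of
 * the per-sample conversion below:
 *
 *     8-bit:  dst[x] = src[x] << 2;          // ushll #2
 *     12-bit: dst[x] = (src[x] + 2) >> 2;    // urshr #2, rounding
 */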
function ff_vvc_dmvr_8_neon, export=1
dst .req x0
src .req x1
src_stride .req x2
height .req w3
mx .req x4
my .req x5
width .req w6
sxtw x6, w6
mov x7, #(VVC_MAX_PB_SIZE * 2 + 8)
cmp width, #16
sub src_stride, src_stride, x6
cset w15, gt // width > 16
movi v16.8h, #2 // DMVR_SHIFT
sub x7, x7, x6, lsl #1
1:
cbz w15, 2f
ldr q0, [src], #16
ushll v1.8h, v0.8b, #2
ushll2 v2.8h, v0.16b, #2
stp q1, q2, [dst], #32
b 3f
2:
ldr d0, [src], #8
ushll v1.8h, v0.8b, #2
str q1, [dst], #16
3:
subs height, height, #1
ldr s3, [src], #4
ushll v4.8h, v3.8b, #2
st1 {v4.4h}, [dst], x7
add src, src, src_stride
b.ne 1b
ret
endfunc
function ff_vvc_dmvr_12_neon, export=1
sxtw x6, w6
mov x7, #(VVC_MAX_PB_SIZE * 2 + 8)
cmp width, #16
sub src_stride, src_stride, x6, lsl #1
cset w15, gt // width > 16
sub x7, x7, x6, lsl #1
1:
cbz w15, 2f
ldp q0, q1, [src], #32
urshr v0.8h, v0.8h, #2
urshr v1.8h, v1.8h, #2
stp q0, q1, [dst], #32
b 3f
2:
ldr q0, [src], #16
urshr v0.8h, v0.8h, #2
str q0, [dst], #16
3:
subs height, height, #1
ldr d0, [src], #8
urshr v0.4h, v0.4h, #2
st1 {v0.4h}, [dst], x7
add src, src, src_stride
b.ne 1b
ret
endfunc
function ff_vvc_dmvr_v_8_neon, export=1
movrel x7, X(ff_vvc_inter_luma_dmvr_filters)
add x7, x7, x5, lsl #1
ld2r {v0.16b, v1.16b}, [x7]
tbz w6, #4, 12f
ldr s16, [x1, #16]
ld1 {v2.16b}, [x1], x2
20:
ldr s17, [x1, #16]
umull v4.8h, v0.8b, v2.8b
umull2 v5.8h, v0.16b, v2.16b
ld1 {v3.16b}, [x1], x2
umull v16.8h, v0.8b, v16.8b
umull v6.8h, v1.8b, v3.8b
umull2 v7.8h, v1.16b, v3.16b
add v4.8h, v4.8h, v6.8h
umull v18.8h, v1.8b, v17.8b
add v5.8h, v5.8h, v7.8h
urshr v4.8h, v4.8h, #2
add v19.4h, v16.4h, v18.4h
urshr v5.8h, v5.8h, #2
urshr v19.4h, v19.4h, #2
st1 {v4.8h, v5.8h}, [x0], #32
subs w3, w3, #1
mov v2.16b, v3.16b
st1 {v19.4h}, [x0], #8
mov v16.16b, v17.16b
add x0, x0, #(VVC_MAX_PB_SIZE * 2 - 32 - 8)
b.ne 20b
ret
12:
ldr s16, [x1, #8]
ld1 {v2.8b}, [x1], x2
2:
ldr s17, [x1, #8]
umull v4.8h, v0.8b, v2.8b
ld1 {v3.8b}, [x1], x2
umull v16.8h, v0.8b, v16.8b
umull v6.8h, v1.8b, v3.8b
add v4.8h, v4.8h, v6.8h
umull v18.8h, v1.8b, v17.8b
srshr v4.8h, v4.8h, #2
add v19.4h, v16.4h, v18.4h
srshr v19.4h, v19.4h, #2
st1 {v4.8h}, [x0], #16
subs w3, w3, #1
mov v2.16b, v3.16b
st1 {v19.4h}, [x0], #8
mov v16.16b, v17.16b
add x0, x0, #(VVC_MAX_PB_SIZE * 2 - 16 - 8)
b.ne 2b
ret
endfunc
function ff_vvc_dmvr_h_8_neon, export=1
movrel x7, X(ff_vvc_inter_luma_dmvr_filters)
add x7, x7, x4, lsl #1
ld2r {v0.16b, v1.16b}, [x7]
tbz w6, #4, 12f
20:
ldur q3, [x1, #1]
ldr q2, [x1]
umull v4.8h, v0.8b, v2.8b
umull2 v5.8h, v0.16b, v2.16b
ldur s17, [x1, #17]
umull v6.8h, v1.8b, v3.8b
ldr s16, [x1, #16]
umull2 v7.8h, v1.16b, v3.16b
add v4.8h, v4.8h, v6.8h
umull v17.8h, v1.8b, v17.8b
add v5.8h, v5.8h, v7.8h
umull v16.8h, v0.8b, v16.8b
srshr v4.8h, v4.8h, #2
add v16.4h, v16.4h, v17.4h
srshr v5.8h, v5.8h, #2
srshr v16.4h, v16.4h, #2
st1 {v4.8h, v5.8h}, [x0], #32
subs w3, w3, #1
st1 {v16.4h}, [x0], #8
add x1, x1, x2
add x0, x0, #(VVC_MAX_PB_SIZE * 2 - 32 - 8)
b.ne 20b
ret
12:
ldur d3, [x1, #1]
ldr d2, [x1]
umull v4.8h, v0.8b, v2.8b
ldur s17, [x1, #9]
umull v6.8h, v1.8b, v3.8b
ldr s16, [x1, #8]
add v4.8h, v4.8h, v6.8h
umull v17.8h, v1.8b, v17.8b
umull v16.8h, v0.8b, v16.8b
srshr v4.8h, v4.8h, #2
add v16.4h, v16.4h, v17.4h
srshr v16.4h, v16.4h, #2
st1 {v4.8h}, [x0], #16
subs w3, w3, #1
st1 {v16.4h}, [x0], #8
add x1, x1, x2
add x0, x0, #(VVC_MAX_PB_SIZE * 2 - 16 - 8)
b.ne 12b
ret
endfunc
.macro vvc_dmvr_h_10 bit_depth
function ff_vvc_dmvr_h_\bit_depth\()_neon, export=1
movrel x7, X(ff_vvc_inter_luma_dmvr_filters)
add x7, x7, x4, lsl #1
ld2r {v0.16b, v1.16b}, [x7]
uxtl v0.8h, v0.8b
uxtl v1.8h, v1.8b
tbz w6, #4, 12f
20:
ldur q3, [x1, #2]
ldr q2, [x1]
ldr q22, [x1, #16]
mul v4.8h, v0.8h, v2.8h
mul v6.8h, v1.8h, v3.8h
ldur q23, [x1, #18]
mul v5.8h, v0.8h, v22.8h
ldur d17, [x1, #34]
mul v7.8h, v1.8h, v23.8h
uhadd v4.8h, v4.8h, v6.8h
ldr d16, [x1, #32]
uhadd v5.8h, v5.8h, v7.8h
mul v17.4h, v1.4h, v17.4h
mul v16.4h, v0.4h, v16.4h
urshr v4.8h, v4.8h, #(\bit_depth - 6 - 1)
urshr v5.8h, v5.8h, #(\bit_depth - 6 - 1)
uhadd v16.4h, v16.4h, v17.4h
urshr v16.4h, v16.4h, #(\bit_depth - 6 - 1)
st1 {v4.8h, v5.8h}, [x0], #32
subs w3, w3, #1
st1 {v16.4h}, [x0], #8
add x1, x1, x2
add x0, x0, #(VVC_MAX_PB_SIZE * 2 - 32 - 8)
b.ne 20b
ret
12:
ldur q3, [x1, #2]
ldr q2, [x1]
mul v4.8h, v0.8h, v2.8h
ldur d17, [x1, #18]
mul v6.8h, v1.8h, v3.8h
ldr d16, [x1, #16]
uhadd v4.8h, v4.8h, v6.8h
mul v17.4h, v1.4h, v17.4h
mul v16.4h, v0.4h, v16.4h
urshr v4.8h, v4.8h, #(\bit_depth - 6 - 1)
uhadd v16.4h, v16.4h, v17.4h
urshr v16.4h, v16.4h, #(\bit_depth - 6 - 1)
st1 {v4.8h}, [x0], #16
subs w3, w3, #1
st1 {v16.4h}, [x0], #8
add x1, x1, x2
add x0, x0, #(VVC_MAX_PB_SIZE * 2 - 16 - 8)
b.ne 12b
ret
endfunc
.endm
vvc_dmvr_h_10 10
vvc_dmvr_h_10 12
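/*
 * The 2-tap DMVR bilinear taps f0/f1 come from
 * ff_vvc_inter_luma_dmvr_filters and sum to 16. A hedged sketch of the
 * horizontal pass above (the vertical pass is the same with a line
 * offset instead of +1):
 *
 *     dst[x] = (f0 * src[x] + f1 * src[x + 1]) >> (bit_depth - 6);
 *
 * (approximately rounded), which again lands at the 10-bit DMVR
 * intermediate precision.
 */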
function ff_vvc_dmvr_hv_8_neon, export=1
tmp0 .req x7
tmp1 .req x8
sub sp, sp, #(VVC_MAX_PB_SIZE * 4)
movrel x9, X(ff_vvc_inter_luma_dmvr_filters)
add x12, x9, mx, lsl #1
mov tmp0, sp
add tmp1, tmp0, #(VVC_MAX_PB_SIZE * 2)
// We know the values are positive
ld2r {v0.16b, v1.16b}, [x12]
add x12, x9, my, lsl #1
ldrb w10, [x12]
ldrb w11, [x12, #1]
sxtw x6, w6
dup v2.8h, w10 // filter_y[0]
dup v3.8h, w11 // filter_y[1]
// The only valid values for width are 8 + 4 and 16 + 4
cmp width, #16
mov w10, #0 // start filter_y or not
add height, height, #1
sub dst, dst, #(VVC_MAX_PB_SIZE * 2)
sub src_stride, src_stride, x6
cset w15, gt // width > 16
1:
mov x12, tmp0
mov x13, tmp1
mov x14, dst
cbz w15, 2f
// width > 16
ldur q5, [src, #1]
ldr q4, [src], #16
umull v6.8h, v4.8b, v0.8b
umull2 v16.8h, v4.16b, v0.16b
umlal v6.8h, v5.8b, v1.8b
umlal2 v16.8h, v5.16b, v1.16b
urshr v6.8h, v6.8h, #(8 - 6)
urshr v7.8h, v16.8h, #(8 - 6)
stp q6, q7, [x13], #32
cbz w10, 3f
ldp q16, q17, [x12], #32
mul v16.8h, v16.8h, v2.8h
mul v17.8h, v17.8h, v2.8h
mla v16.8h, v6.8h, v3.8h
mla v17.8h, v7.8h, v3.8h
urshr v16.8h, v16.8h, #4
urshr v17.8h, v17.8h, #4
stp q16, q17, [x14], #32
b 3f
2:
// width > 8
ldur d5, [src, #1]
ldr d4, [src], #8
umull v6.8h, v4.8b, v0.8b
umlal v6.8h, v5.8b, v1.8b
urshr v6.8h, v6.8h, #(8 - 6)
str q6, [x13], #16
cbz w10, 3f
ldr q16, [x12], #16
mul v16.8h, v16.8h, v2.8h
mla v16.8h, v6.8h, v3.8h
urshr v16.8h, v16.8h, #4
str q16, [x14], #16
3:
ldur s5, [src, #1]
ldr s4, [src], #4
umull v6.8h, v4.8b, v0.8b
umlal v6.8h, v5.8b, v1.8b
urshr v6.4h, v6.4h, #(8 - 6)
str d6, [x13], #8
cbz w10, 4f
ldr d16, [x12], #8
mul v16.4h, v16.4h, v2.4h
mla v16.4h, v6.4h, v3.4h
urshr v16.4h, v16.4h, #4
str d16, [x14], #8
4:
subs height, height, #1
mov w10, #1
add src, src, src_stride
add dst, dst, #(VVC_MAX_PB_SIZE * 2)
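// XOR-swap tmp0/tmp1: ping-pong between the two intermediate row buffers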
eor tmp0, tmp0, tmp1
eor tmp1, tmp0, tmp1
eor tmp0, tmp0, tmp1
b.ne 1b
add sp, sp, #(VVC_MAX_PB_SIZE * 4)
ret
endfunc
function ff_vvc_dmvr_hv_12_neon, export=1
mvni v29.4s, #(12 - 6 - 1)
b 0f
endfunc
function ff_vvc_dmvr_hv_10_neon, export=1
mvni v29.4s, #(10 - 6 - 1)
0:
sub sp, sp, #(VVC_MAX_PB_SIZE * 4)
movrel x9, X(ff_vvc_inter_luma_dmvr_filters)
add x12, x9, mx, lsl #1
ldrb w10, [x12]
ldrb w11, [x12, #1]
mov tmp0, sp
add tmp1, tmp0, #(VVC_MAX_PB_SIZE * 2)
// We know the values are positive
dup v0.8h, w10 // filter_x[0]
dup v1.8h, w11 // filter_x[1]
add x12, x9, my, lsl #1
ldrb w10, [x12]
ldrb w11, [x12, #1]
dup v2.8h, w10 // filter_y[0]
dup v3.8h, w11 // filter_y[1]
// The only valid values for width are 8 + 4 and 16 + 4
cmp width, #16
mov w10, #0 // start filter_y or not
add height, height, #1
sub dst, dst, #(VVC_MAX_PB_SIZE * 2)
sub src_stride, src_stride, w6, sxtw #1
cset w15, gt // width > 16
1:
mov x12, tmp0
mov x13, tmp1
mov x14, dst
cbz w15, 2f
// width > 16
add x16, src, #2
ldp q6, q16, [src], #32
ldp q7, q17, [x16]
umull v4.4s, v6.4h, v0.4h
umull2 v5.4s, v6.8h, v0.8h
umull v18.4s, v16.4h, v0.4h
umull2 v19.4s, v16.8h, v0.8h
umlal v4.4s, v7.4h, v1.4h
umlal2 v5.4s, v7.8h, v1.8h
umlal v18.4s, v17.4h, v1.4h
umlal2 v19.4s, v17.8h, v1.8h
urshl v4.4s, v4.4s, v29.4s
urshl v5.4s, v5.4s, v29.4s
urshl v18.4s, v18.4s, v29.4s
urshl v19.4s, v19.4s, v29.4s
uqxtn v6.4h, v4.4s
uqxtn2 v6.8h, v5.4s
uqxtn v7.4h, v18.4s
uqxtn2 v7.8h, v19.4s
stp q6, q7, [x13], #32
cbz w10, 3f
ldp q4, q5, [x12], #32
umull v17.4s, v4.4h, v2.4h
umull2 v18.4s, v4.8h, v2.8h
umull v19.4s, v5.4h, v2.4h
umull2 v20.4s, v5.8h, v2.8h
umlal v17.4s, v6.4h, v3.4h
umlal2 v18.4s, v6.8h, v3.8h
umlal v19.4s, v7.4h, v3.4h
umlal2 v20.4s, v7.8h, v3.8h
uqrshrn v6.4h, v17.4s, #4
uqrshrn2 v6.8h, v18.4s, #4
uqrshrn v7.4h, v19.4s, #4
uqrshrn2 v7.8h, v20.4s, #4
stp q6, q7, [x14], #32
b 3f
2:
// width > 8
ldur q7, [src, #2]
ldr q6, [src], #16
umull v4.4s, v6.4h, v0.4h
umull2 v5.4s, v6.8h, v0.8h
umlal v4.4s, v7.4h, v1.4h
umlal2 v5.4s, v7.8h, v1.8h
urshl v4.4s, v4.4s, v29.4s
urshl v5.4s, v5.4s, v29.4s
uqxtn v6.4h, v4.4s
uqxtn2 v6.8h, v5.4s
str q6, [x13], #16
cbz w10, 3f
ldr q16, [x12], #16
umull v17.4s, v16.4h, v2.4h
umull2 v18.4s, v16.8h, v2.8h
umlal v17.4s, v6.4h, v3.4h
umlal2 v18.4s, v6.8h, v3.8h
urshr v17.4s, v17.4s, #4
urshr v18.4s, v18.4s, #4
uqxtn v16.4h, v17.4s
uqxtn2 v16.8h, v18.4s
str q16, [x14], #16
3:
ldur d7, [src, #2]
ldr d6, [src], #8
umull v4.4s, v7.4h, v1.4h
umlal v4.4s, v6.4h, v0.4h
urshl v4.4s, v4.4s, v29.4s
uqxtn v6.4h, v4.4s
str d6, [x13], #8
cbz w10, 4f
ldr d16, [x12], #8
umull v17.4s, v16.4h, v2.4h
umlal v17.4s, v6.4h, v3.4h
urshr v17.4s, v17.4s, #4
uqxtn v16.4h, v17.4s
str d16, [x14], #8
4:
subs height, height, #1
mov w10, #1
add src, src, src_stride
add dst, dst, #(VVC_MAX_PB_SIZE * 2)
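// XOR-swap tmp0/tmp1: ping-pong between the two intermediate row buffers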
eor tmp0, tmp0, tmp1
eor tmp1, tmp0, tmp1
eor tmp0, tmp0, tmp1
b.ne 1b
add sp, sp, #(VVC_MAX_PB_SIZE * 4)
ret
.unreq dst
.unreq src
.unreq src_stride
.unreq height
.unreq mx
.unreq my
.unreq width
.unreq tmp0
.unreq tmp1
endfunc
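/*
 * Gradient filters. A hedged sketch of what both functions below compute
 * per pixel (src points into a 16-bit prediction buffer):
 *
 *     gh[x] = (src[x + 1]      >> 6) - (src[x - 1]      >> 6);
 *     gv[x] = (src[x + stride] >> 6) - (src[x - stride] >> 6);
 *
 * The BDOF variant folds the two references together on the fly and
 * stores (g0 + g1) >> 1 into gh0/gv0 and g0 - g1 into gh1/gv1.
 */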
function ff_vvc_prof_grad_filter_8x_neon, export=1
gh .req x0
gv .req x1
gstride .req x2
src .req x3
src_stride .req x4
width .req w5
height .req w6
lsl src_stride, src_stride, #1
neg x7, src_stride
1:
mov x10, src
mov w11, width
mov x12, gh
mov x13, gv
2:
ldur q0, [x10, #2]
ldur q1, [x10, #-2]
subs w11, w11, #8
ldr q2, [x10, src_stride]
ldr q3, [x10, x7]
sshr v0.8h, v0.8h, #6
sshr v1.8h, v1.8h, #6
sshr v2.8h, v2.8h, #6
sshr v3.8h, v3.8h, #6
sub v0.8h, v0.8h, v1.8h
sub v2.8h, v2.8h, v3.8h
st1 {v0.8h}, [x12], #16
st1 {v2.8h}, [x13], #16
add x10, x10, #16
b.ne 2b
subs height, height, #1
add gh, gh, gstride, lsl #1
add gv, gv, gstride, lsl #1
add src, src, src_stride
b.ne 1b
ret
.unreq gh
.unreq gv
.unreq gstride
.unreq src
.unreq src_stride
.unreq width
.unreq height
endfunc
function vvc_bdof_grad_filter_8x_neon, export=0
gh0 .req x0
gh1 .req x1
gv0 .req x2
gv1 .req x3
src0 .req x4
src1 .req x5
width .req w6
height .req w7
tbnz w6, #4, 16f
8:
ldur q0, [src0, #2]
ldur q1, [src0, #-2]
ldr q2, [src0, #(VVC_MAX_PB_SIZE << 1)]
ldr q3, [src0, #-(VVC_MAX_PB_SIZE << 1)]
sshr v0.8h, v0.8h, #6
sshr v1.8h, v1.8h, #6
ldur q4, [src1, #2]
ldur q5, [src1, #-2]
sshr v2.8h, v2.8h, #6
sshr v3.8h, v3.8h, #6
ldr q6, [src1, #(VVC_MAX_PB_SIZE << 1)]
ldr q7, [src1, #-(VVC_MAX_PB_SIZE << 1)]
// results of gradient_h0
sub v0.8h, v0.8h, v1.8h
// results of gradient_v0
sub v2.8h, v2.8h, v3.8h
sshr v4.8h, v4.8h, #6
sshr v5.8h, v5.8h, #6
sshr v6.8h, v6.8h, #6
sshr v7.8h, v7.8h, #6
// results of gradient_h1
sub v4.8h, v4.8h, v5.8h
// results of gradient_v1
sub v6.8h, v6.8h, v7.8h
// (gradient_h0 + gradient_h1) >> 1
shadd v1.8h, v0.8h, v4.8h
// gradient_h0 - gradient_h1
sub v5.8h, v0.8h, v4.8h
// (gradient_v0 + gradient_v1) >> 1
shadd v3.8h, v2.8h, v6.8h
// gradient_v0 - gradient_v1
sub v7.8h, v2.8h, v6.8h
st1 {v1.8h}, [gh0]
st1 {v5.8h}, [gh1]
st1 {v3.8h}, [gv0]
st1 {v7.8h}, [gv1]
subs height, height, #1
add gh0, gh0, #(BDOF_BLOCK_SIZE << 1)
add gv0, gv0, #(BDOF_BLOCK_SIZE << 1)
add src0, src0, #(VVC_MAX_PB_SIZE << 1)
add gh1, gh1, #(BDOF_BLOCK_SIZE << 1)
add gv1, gv1, #(BDOF_BLOCK_SIZE << 1)
add src1, src1, #(VVC_MAX_PB_SIZE << 1)
b.ne 8b
ret
16:
ldur q0, [src0, #2]
ldur q1, [src0, #18]
ldur q16, [src0, #-2]
sshr v0.8h, v0.8h, #6
ldur q17, [src0, #14]
sshr v1.8h, v1.8h, #6
ldp q18, q19, [src0, #-(VVC_MAX_PB_SIZE << 1)]
sshr v16.8h, v16.8h, #6
ldp q2, q3, [src0, #(VVC_MAX_PB_SIZE << 1)]!
ldur q20, [src1, #2]
sshr v17.8h, v17.8h, #6
ldur q21, [src1, #18]
sshr v2.8h, v2.8h, #6
ldur q22, [src1, #-2]
sshr v3.8h, v3.8h, #6
ldur q23, [src1, #14]
sshr v18.8h, v18.8h, #6
ldp q26, q27, [src1, #-(VVC_MAX_PB_SIZE << 1)]
sshr v19.8h, v19.8h, #6
ldp q24, q25, [src1, #(VVC_MAX_PB_SIZE << 1)]!
// results of gradient_h0
sub v0.8h, v0.8h, v16.8h
sub v1.8h, v1.8h, v17.8h
// results of gradient_v0
sub v2.8h, v2.8h, v18.8h
sub v3.8h, v3.8h, v19.8h
sshr v20.8h, v20.8h, #6
sshr v21.8h, v21.8h, #6
sshr v22.8h, v22.8h, #6
sshr v23.8h, v23.8h, #6
// results of gradient_h1
sub v20.8h, v20.8h, v22.8h
sub v21.8h, v21.8h, v23.8h
sshr v24.8h, v24.8h, #6
sshr v25.8h, v25.8h, #6
// gradient_h0 - gradient_h1
sub v22.8h, v0.8h, v20.8h
sub v23.8h, v1.8h, v21.8h
// (gradient_h0 + gradient_h1) >> 1
shadd v16.8h, v0.8h, v20.8h
shadd v17.8h, v1.8h, v21.8h
st1 {v22.8h, v23.8h}, [gh1], #32
sshr v26.8h, v26.8h, #6
sshr v27.8h, v27.8h, #6
st1 {v16.8h, v17.8h}, [gh0], #32
// results of gradient_v1
sub v24.8h, v24.8h, v26.8h
sub v25.8h, v25.8h, v27.8h
// (gradient_v0 + gradient_v1) >> 1
shadd v18.8h, v2.8h, v24.8h
shadd v19.8h, v3.8h, v25.8h
// gradient_v0 - gradient_v1
sub v26.8h, v2.8h, v24.8h
sub v27.8h, v3.8h, v25.8h
st1 {v18.8h,v19.8h}, [gv0], #32
subs height, height, #1
st1 {v26.8h,v27.8h}, [gv1], #32
b.ne 16b
ret
.unreq gh0
.unreq gh1
.unreq gv0
.unreq gv1
.unreq src0
.unreq src1
.unreq width
.unreq height
endfunc
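/*
 * BDOF output. A hedged sketch of the per-pixel formula the apply
 * macros compute (vx/vy are the per-4x4 refinements, gh/gv the
 * per-reference gradient differences g0 - g1 prepared above):
 *
 *     int32_t val = src0[x] + src1[x] + vx * gh[x] + vy * gv[x];
 *     dst[x]      = av_clip((val + (1 << (14 - bd))) >> (15 - bd),
 *                           0, (1 << bd) - 1);
 *
 * The 8x variant folds the rounding offset into sqrshrun; the 16x
 * variant adds it explicitly (v7) and narrows with sqshrn.
 */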
.macro vvc_apply_bdof_block_8x bit_depth
dst .req x0
dst_stride .req x1
src0 .req x2
src1 .req x3
gh .req x4
gv .req x5
vx .req x6
vy .req x7
ldr w8, [sp]
mov x12, #(BDOF_BLOCK_SIZE * 2)
mov x14, #(VVC_MAX_PB_SIZE * 2)
.if \bit_depth >= 10
// clip pixel
mov w15, #((1 << \bit_depth) - 1)
dup v19.8h, w15
.endif
0:
ldr s0, [vx], #(2 * BDOF_MIN_BLOCK_SIZE)
ldr s1, [vy], #(2 * BDOF_MIN_BLOCK_SIZE)
mov w13, #(BDOF_MIN_BLOCK_SIZE)
1:
ld1 {v5.8h}, [src0], x14
ld1 {v6.8h}, [src1], x14
saddl v17.4s, v5.4h, v6.4h
ld1 {v4.8h}, [gv], x12
saddl2 v16.4s, v5.8h, v6.8h
ld1 {v2.8h}, [gh], x12
smlal v17.4s, v4.4h, v1.h[0]
smlal2 v16.4s, v4.8h, v1.h[1]
smlal v17.4s, v2.4h, v0.h[0]
smlal2 v16.4s, v2.8h, v0.h[1]
sqrshrun v5.4h, v17.4s, #(15 - \bit_depth)
sqrshrun2 v5.8h, v16.4s, #(15 - \bit_depth)
subs w13, w13, #1
.if \bit_depth == 8
sqxtun v5.8b, v5.8h
st1 {v5.8b}, [dst], dst_stride
.else
smin v5.8h, v5.8h, v19.8h
st1 {v5.8h}, [dst], dst_stride
.endif
b.ne 1b
subs w8, w8, #(BDOF_MIN_BLOCK_SIZE)
b.ne 0b
ret
.unreq dst
.unreq dst_stride
.unreq src0
.unreq src1
.unreq gh
.unreq gv
.unreq vx
.unreq vy
.endm
function vvc_apply_bdof_block_8x_8_neon, export=0
vvc_apply_bdof_block_8x 8
endfunc
function vvc_apply_bdof_block_8x_10_neon, export=0
vvc_apply_bdof_block_8x 10
endfunc
function vvc_apply_bdof_block_8x_12_neon, export=0
vvc_apply_bdof_block_8x 12
endfunc
.macro vvc_apply_bdof_block_16x bit_depth
dst .req x0
dst_stride .req x1
src0 .req x2
src1 .req x3
gh .req x4
gv .req x5
vx .req x6
vy .req x7
ldr w8, [sp]
movi v7.4s, #(1 << (14 - \bit_depth))
.if \bit_depth >= 10
// clip pixel
mov w15, #((1 << \bit_depth) - 1)
movi v18.8h, #0
dup v19.8h, w15
.endif
0:
ld1r {v0.8h}, [vx], #2
ld1r {v1.8h}, [vy], #2
ld1r {v2.8h}, [vx], #2
ld1r {v3.8h}, [vy], #2
mov w13, #(BDOF_MIN_BLOCK_SIZE)
ld1r {v20.8h}, [vx], #2
ld1r {v21.8h}, [vy], #2
ld1r {v22.8h}, [vx], #2
ld1r {v23.8h}, [vy], #2
ins v0.d[1], v2.d[1]
ins v1.d[1], v3.d[1]
ins v20.d[1], v22.d[1]
ins v21.d[1], v23.d[1]
1:
ldp q2, q22, [gh], #(BDOF_BLOCK_SIZE * 2)
ldp q4, q24, [gv], #(BDOF_BLOCK_SIZE * 2)
smull v3.4s, v0.4h, v2.4h
smull2 v16.4s, v0.8h, v2.8h
smlal v3.4s, v1.4h, v4.4h
smlal2 v16.4s, v1.8h, v4.8h
ldp q5, q25, [src0], #(VVC_MAX_PB_SIZE * 2)
ldp q6, q26, [src1], #(VVC_MAX_PB_SIZE * 2)
smull v23.4s, v20.4h, v22.4h
smull2 v27.4s, v20.8h, v22.8h
smlal v23.4s, v21.4h, v24.4h
smlal2 v27.4s, v21.8h, v24.8h
saddl v2.4s, v5.4h, v6.4h
add v2.4s, v2.4s, v7.4s
add v2.4s, v2.4s, v3.4s
saddl2 v4.4s, v5.8h, v6.8h
add v4.4s, v4.4s, v7.4s
add v4.4s, v4.4s, v16.4s
saddl v22.4s, v25.4h, v26.4h
add v22.4s, v22.4s, v7.4s
add v22.4s, v22.4s, v23.4s
saddl2 v24.4s, v25.8h, v26.8h
add v24.4s, v24.4s, v7.4s
add v24.4s, v24.4s, v27.4s
sqshrn v5.4h, v2.4s, #(15 - \bit_depth)
sqshrn2 v5.8h, v4.4s, #(15 - \bit_depth)
sqshrn v25.4h, v22.4s, #(15 - \bit_depth)
sqshrn2 v25.8h, v24.4s, #(15 - \bit_depth)
subs w13, w13, #1
.if \bit_depth == 8
sqxtun v5.8b, v5.8h
sqxtun2 v5.16b, v25.8h
str q5, [dst]
.else
smin v5.8h, v5.8h, v19.8h
smax v5.8h, v5.8h, v18.8h
smin v25.8h, v25.8h, v19.8h
smax v25.8h, v25.8h, v18.8h
stp q5, q25, [dst]
.endif
add dst, dst, dst_stride
b.ne 1b
subs w8, w8, #(BDOF_MIN_BLOCK_SIZE)
b.ne 0b
ret
.unreq dst
.unreq dst_stride
.unreq src0
.unreq src1
.unreq gh
.unreq gv
.unreq vx
.unreq vy
.endm
function vvc_apply_bdof_block_16x_8_neon, export=0
vvc_apply_bdof_block_16x 8
endfunc
function vvc_apply_bdof_block_16x_10_neon, export=0
vvc_apply_bdof_block_16x 10
endfunc
function vvc_apply_bdof_block_16x_12_neon, export=0
vvc_apply_bdof_block_16x 12
endfunc
const bdof_vx_vy_8x_tbl
.byte 0, 1, 16, 16, 16, 16, 8, 9
.byte 6, 7, 16, 16, 16, 16, 14, 15
endconst
const bdof_vx_vy_16x_tbl
.byte 0, 1, 64, 64, 64, 64, 8, 9
.byte 6, 7, 64, 64, 64, 64, 16, 17
.byte 14, 15, 64, 64, 64, 64, 24, 25
.byte 22, 23, 64, 64, 64, 64, 30, 31
endconst
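// tbl returns 0 for out-of-range indices (>= 16 with one source register,
// >= 64 with two), so the tables above produce the zero columns of the
// "Pixel tricks" diagram below, while the in-range entries re-select the
// edge pixels so that C-1 = C0 and C16 = C15.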
// line(-1), line0, line1, line2, line3, line4
// line3 and line4 become line(-1) and line0 in the next block.
.macro bdof_vx_vy_8x_save_line tmp0, tmp1, tmp2, tmp3, tmp4
mov \tmp0\().16b, v28.16b
mov \tmp1\().16b, v29.16b
mov \tmp2\().16b, v30.16b
mov \tmp3\().16b, v31.16b
mov \tmp4\().16b, v8.16b
.endm
.macro bdof_vx_vy_8x_add_line tmp0, tmp1, tmp2, tmp3, tmp4
add v25.4s, v25.4s, \tmp0\().4s
add v27.4s, v27.4s, \tmp1\().4s
add v23.4s, v23.4s, \tmp2\().4s
sub v26.4s, v26.4s, \tmp3\().4s
sub v24.4s, v24.4s, \tmp4\().4s
.endm
.macro bdof_vx_vy_8x_padding_left_right src, tmp0, tmp1, dst
tbl \tmp0\().16b, { \src\().16b }, v0.16b
saddl \tmp1\().4s, \tmp0\().4h, \src\().4h
saddl2 \dst\().4s, \tmp0\().8h, \src\().8h
addp \dst\().4s, \tmp1\().4s, \dst\().4s
.endm
.macro bdof_vx_vy_sign src, tmp0, tmp1, dst
cmlt \tmp0\().8h, \src\().8h, #0
cmgt \tmp1\().8h, \src\().8h, #0
sub \dst\().8h, \tmp0\().8h, \tmp1\().8h
.endm
.macro bdof_vx_vy_clip_mask src, max, min, mask, dst
smin \src\().4s, \src\().4s, \max\().4s
smax \src\().4s, \src\().4s, \min\().4s
cmgt \mask\().4s, \mask\().4s, #0
and \dst\().16b, \src\().16b, \mask\().16b
.endm
.macro bdof_vx_vy_16x_save_line tmp0, tmp1, tmp2, tmp3, tmp4
mov \tmp0\().16b, v29.16b
mov \tmp1\().16b, v30.16b
mov \tmp2\().16b, v31.16b
mov \tmp3\().16b, v8.16b
mov \tmp4\().16b, v9.16b
.endm
.macro bdof_vx_vy_16x_add_line tmp0, tmp1, tmp2, tmp3, tmp4
add v25.4s, v25.4s, \tmp0\().4s
add v24.4s, v24.4s, \tmp1\().4s
add v26.4s, v26.4s, \tmp2\().4s
sub v28.4s, v28.4s, \tmp3\().4s
sub v27.4s, v27.4s, \tmp4\().4s
.endm
.macro bdof_vx_vy_16x_padding_left_right src0, src1, tmp0, tmp1, tmp2, dst
tbl \tmp0\().16b, {\src0\().16b, \src1\().16b}, v0.16b
tbl v2.16b, {\src0\().16b, \src1\().16b}, v1.16b
saddl \tmp1\().4s, \tmp0\().4h, \src0\().4h
saddl \tmp2\().4s, v2.4h, \src1\().4h
saddl2 \tmp0\().4s, \tmp0\().8h, \src0\().8h
saddl2 \dst\().4s, v2.8h, \src1\().8h
addp \tmp0\().4s, \tmp1\().4s, \tmp0\().4s
addp \dst\().4s, \tmp2\().4s, \dst\().4s
addp \dst\().4s, \tmp0\().4s, \dst\().4s
.endm
/*
* Line tricks:
* We need 6 lines of information, lines 4N-1 through 4N+4. Lines 4N-1
* and 4N+0 were already processed for the previous group, so they can be
* reused.
*
* (4N-1) [xxxxxxxxxxxxx] <--- reuse
* (4N) [xxxxxxxxxxxxx] <--- reuse
* (4N+1) [xxxxxxxxxxxxx]
* (4N+2) [xxxxxxxxxxxxx]
* (4N+3) [xxxxxxxxxxxxx] ---> save for reuse
* (4N+4) [xxxxxxxxxxxxx] ---> save for reuse
*
* Special case:
* 1. Line -1 is a duplicate of line 0 (top padding).
* 2. The line after the last is a duplicate of the last line (bottom padding).
*
* ---------------------------------------------------------------------
* Pixel tricks:
*
* [C-1, C0, C1, C2, ... C16]
*
* For each line, we need the sum over a 6-pixel window for each of the
* four 4-pixel groups:
* - C-1 + C0 + C1 + C2 + C3 + C4
* - C3 + C4 + C5 + C6 + C7 + C8
* - C7 + C8 + C9 + C10 + C11 + C12
* - C11 + C12 + C13 + C14 + C15 + C16
*
* C-1 is C0, C16 is C15, so we can do:
*
* [C0, C1, C2, C3, | C4, C5, C6, C7, | C8, ... C15]
* + | + |
* [C0, 0, 0, C4, | C3, 0, 0, C8, | C7, ... C15]
*
* 8x is similar.
* ----------------------------------------------------------------------
* x0: const int16_t *_src0,
* x1: const int16_t *_src1,
* x2: const int16_t *gradient_h,
* x3: const int16_t *gradient_v,
* x4: int16_t vx[16],
* x5: int16_t vy[16],
* w6: int block_h
*/
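/*
 * Given the five per-4x4 window sums accumulated below (sgx2, sgy2,
 * sgxgy, sgxdi, sgydi), the "2:" sections derive the refinement; a
 * hedged C sketch:
 *
 *     vx = sgx2 > 0 ? av_clip((sgxdi * 4) >> av_log2(sgx2), -15, 15) : 0;
 *     vy = sgy2 > 0 ? av_clip((sgydi * 4 - ((vx * sgxgy) >> 1))
 *                             >> av_log2(sgy2), -15, 15) : 0;
 *
 * floor(log2) is computed as 31 - clz, and the division becomes a right
 * shift by that amount (sshl with a negative shift).
 */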
function vvc_derive_bdof_vx_vy_8x_neon, export=0
stp d11, d10, [sp, #-0x20]!
stp d9, d8, [sp, #0x10]
movrel x11, bdof_vx_vy_8x_tbl
ldr q0, [x11] // table
mvni v2.4s, #30 // -31, for log2
movi v3.4s, #15 // clip to 15
mvni v4.4s, #14 // clip to -15
mov w11, #0x8
mov w12, w6 // y = block_h
b 4f
1:
// save line4 results
bdof_vx_vy_8x_save_line v5, v6, v7, v16, v17
2:
addp v25.4s, v25.4s, v25.4s
addp v27.4s, v27.4s, v27.4s
addp v26.4s, v26.4s, v26.4s
addp v23.4s, v23.4s, v23.4s
addp v24.4s, v24.4s, v24.4s
clz v28.4s, v25.4s
add v28.4s, v28.4s, v2.4s // log2
shl v26.4s, v26.4s, #0x2
sshl v26.4s, v26.4s, v28.4s
bdof_vx_vy_clip_mask v26, v3, v4, v25, v25
sqxtn v26.4h, v25.4s
st1 {v26.s}[0], [x4], x11
subs x12, x12, #(BDOF_MIN_BLOCK_SIZE)
clz v26.4s, v27.4s
add v26.4s, v26.4s, v2.4s
shl v24.4s, v24.4s, #0x2
mul v23.4s, v25.4s, v23.4s
sshr v23.4s, v23.4s, #0x1
sub v23.4s, v24.4s, v23.4s
sshl v23.4s, v23.4s, v26.4s
bdof_vx_vy_clip_mask v23, v3, v4, v27, v23
sqxtn v23.4h, v23.4s
st1 {v23.s}[0], [x5], x11
b.eq 16f
4:
mov x15, #0x0 // dy, inner loop
movi v25.2d, #0
movi v27.2d, #0
movi v23.2d, #0
movi v26.2d, #0
movi v24.2d, #0
b 8f
5:
// add line(-1) and line0 from previous results
bdof_vx_vy_8x_add_line v18, v19, v20, v21, v22
bdof_vx_vy_8x_add_line v5, v6, v7, v16, v17
add x15, x15, #1
8:
cmp w12, w6
b.hs 9f
// y < block_h && dy == 0, reuse previous results
cbz x15, 5b
9:
ldr q28, [x0] // src0
ldr q29, [x1] // src1
ldr q30, [x2], #(BDOF_BLOCK_SIZE * 2) // (gh0 + gh1) >> 1
ldr q31, [x3], #(BDOF_BLOCK_SIZE * 2) // (gv0 + gv1) >> 1
add x0, x0, #(VVC_MAX_PB_SIZE * 2)
add x1, x1, #(VVC_MAX_PB_SIZE * 2)
sshr v28.8h, v28.8h, #0x4
sshr v29.8h, v29.8h, #0x4
sub v8.8h, v28.8h, v29.8h // diff
abs v28.8h, v30.8h
abs v29.8h, v31.8h
bdof_vx_vy_8x_padding_left_right v28, v9, v10, v28
bdof_vx_vy_8x_padding_left_right v29, v9, v10, v29
bdof_vx_vy_sign v30, v9, v10, v9
bdof_vx_vy_sign v31, v10, v31, v31
mul v30.8h, v31.8h, v30.8h
mul v9.8h, v9.8h, v8.8h
mul v8.8h, v31.8h, v8.8h
bdof_vx_vy_8x_padding_left_right v30, v31, v10, v30
bdof_vx_vy_8x_padding_left_right v9, v31, v10, v31
bdof_vx_vy_8x_padding_left_right v8, v9, v10, v8
bdof_vx_vy_8x_add_line v28, v29, v30, v31, v8
cmp w12, w6
b.ne 10f
cbnz x15, 10f
// y == block_h && dy == 0, duplicate first line results
bdof_vx_vy_8x_add_line v28, v29, v30, v31, v8
add x15, x15, #0x1
b 9b
10:
cmp x15, #(BDOF_MIN_BLOCK_SIZE - 1)
b.eq 11f
cmp x15, #(BDOF_MIN_BLOCK_SIZE)
b.ne 12f
b 1b
11:
// y == BDOF_MIN_BLOCK_SIZE && dy == BDOF_MIN_BLOCK_SIZE - 1
// duplicate the results and break
cmp x12, #(BDOF_MIN_BLOCK_SIZE)
b.eq 13f
bdof_vx_vy_8x_save_line v18, v19, v20, v21, v22
12:
add x15, x15, #1
b 8b
13:
// y == BDOF_MIN_BLOCK_SIZE && dy == BDOF_MIN_BLOCK_SIZE - 1
// padding bottom then break
bdof_vx_vy_8x_add_line v28, v29, v30, v31, v8
b 2b
16:
ldp d9, d8, [sp, #0x10]
ldp d11, d10, [sp], #0x20
ret
endfunc
/*
* x0: const int16_t *_src0,
* x1: const int16_t *_src1,
* x2: const int16_t *gradient_h,
* x3: const int16_t *gradient_v,
* x4: int16_t vx[16],
* x5: int16_t vy[16],
* w6: int block_h
*/
function vvc_derive_bdof_vx_vy_16x_neon, export=0
stp d15, d14, [sp, #-0x40]!
stp d13, d12, [sp, #0x10]
stp d11, d10, [sp, #0x20]
stp d9, d8, [sp, #0x30]
movrel x12, bdof_vx_vy_16x_tbl
ldp q0, q1, [x12] // table
mov w13, w6 // y = block_h
b 4f
1:
// save line4
bdof_vx_vy_16x_save_line v6, v7, v16, v17, v18
2:
clz v3.4s, v25.4s
mvni v5.4s, #0x1e
add v3.4s, v3.4s, v5.4s // -log2()
shl v4.4s, v28.4s, #0x2
sshl v3.4s, v4.4s, v3.4s
movi v28.4s, #0xf // clip to 15
mvni v29.4s, #0xe // clip to -15
bdof_vx_vy_clip_mask v3, v28, v29, v25, v3
sqxtn v4.4h, v3.4s
st1 {v4.d}[0], [x4], #(BDOF_MIN_BLOCK_SIZE * 2)
subs x13, x13, #(BDOF_MIN_BLOCK_SIZE) // y -= BDOF_MIN_BLOCK_SIZE
clz v4.4s, v24.4s
add v4.4s, v4.4s, v5.4s // -log2()
shl v5.4s, v27.4s, #0x2
mul v3.4s, v3.4s, v26.4s
sshr v3.4s, v3.4s, #0x1
sub v3.4s, v5.4s, v3.4s
sshl v3.4s, v3.4s, v4.4s
bdof_vx_vy_clip_mask v3, v28, v29, v24, v3
sqxtn v3.4h, v3.4s
st1 {v3.d}[0], [x5], #(BDOF_MIN_BLOCK_SIZE * 2)
b.eq 16f
4:
mov w14, #0x0 // dy, inner loop
movi v25.2d, #0
movi v24.2d, #0
movi v26.2d, #0
movi v28.2d, #0
movi v27.2d, #0
b 8f
5:
// add line(-1) and line0 from previous results
bdof_vx_vy_16x_add_line v19, v20, v21, v22, v23
bdof_vx_vy_16x_add_line v6, v7, v16, v17, v18
add w14, w14, #0x1
8:
cmp w13, w6
b.hs 9f
// y < block_h && dy == 0, reuse previous results
cbz w14, 5b
9:
ld1 {v29.8h, v30.8h}, [x0] // src0
sshr v31.8h, v29.8h, #0x4
ld1 {v8.8h, v9.8h}, [x1] // src1
sshr v10.8h, v8.8h, #0x4
ldp q13, q8, [x2], #32 // (gh0 + gh1) >> 1
sshr v29.8h, v30.8h, #0x4
sshr v30.8h, v9.8h, #0x4
ldp q5, q3, [x3], #32 // (gv0 + gv1) >> 1
sub v31.8h, v31.8h, v10.8h // diff, left half
sub v4.8h, v29.8h, v30.8h // diff, right half
abs v29.8h, v13.8h
abs v30.8h, v8.8h
abs v9.8h, v5.8h
abs v10.8h, v3.8h
add x0, x0, #(VVC_MAX_PB_SIZE * 2)
add x1, x1, #(VVC_MAX_PB_SIZE * 2)
bdof_vx_vy_16x_padding_left_right v29, v30, v11, v12, v14, v29
bdof_vx_vy_16x_padding_left_right v9, v10, v11, v12, v14, v30
bdof_vx_vy_sign v13, v9, v10, v9
bdof_vx_vy_sign v8, v10, v11, v10
bdof_vx_vy_sign v5, v11, v5, v5
bdof_vx_vy_sign v3, v11, v3, v3
mul v11.8h, v5.8h, v13.8h
mul v12.8h, v3.8h, v8.8h
mul v8.8h, v9.8h, v31.8h
mul v9.8h, v10.8h, v4.8h
mul v13.8h, v5.8h, v31.8h
mul v14.8h, v3.8h, v4.8h
bdof_vx_vy_16x_padding_left_right v11, v12, v3, v4, v5, v31
bdof_vx_vy_16x_padding_left_right v8, v9, v3, v4, v5, v8
bdof_vx_vy_16x_padding_left_right v13, v14, v3, v4, v5, v9
bdof_vx_vy_16x_add_line v29, v30, v31, v8, v9
// check whether padding top
cmp w13, w6
b.ne 10f
cbnz w14, 10f
// y == block_h && dy == 0, padding top
bdof_vx_vy_16x_add_line v29, v30, v31, v8, v9
add w14, w14, #0x1
b 9b
10:
cmp w14, #(BDOF_MIN_BLOCK_SIZE - 1)
b.eq 11f
cmp w14, #(BDOF_MIN_BLOCK_SIZE)
b.ne 12f
// save line4
b 1b
11:
// y == BDOF_MIN_BLOCK_SIZE && dy == BDOF_MIN_BLOCK_SIZE - 1, padding bottom
cmp x13, #(BDOF_MIN_BLOCK_SIZE)
b.eq 13f
// save line3
bdof_vx_vy_16x_save_line v19, v20, v21, v22, v23
12:
add w14, w14, #0x1 // dy++
b 8b
13:
// padding bottom
bdof_vx_vy_16x_add_line v29, v30, v31, v8, v9
b 2b
16:
// restore
ldp d9, d8, [sp, #0x30]
ldp d11, d10, [sp, #0x20]
ldp d13, d12, [sp, #0x10]
ldp d15, d14, [sp], #0x40
ret
endfunc
function ff_vvc_apply_bdof_10_neon, export=1
mov w6, #10
b 0f
endfunc
function ff_vvc_apply_bdof_12_neon, export=1
mov w6, #12
b 0f
endfunc
// int16_t gradient_buf_h[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2]
// int16_t gradient_buf_v[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2]
// int16_t vx[BDOF_BLOCK_SIZE], vy[BDOF_BLOCK_SIZE];
#define APPLY_BDOF_STACK_SIZE ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 8 + BDOF_BLOCK_SIZE * 4)
#define GRADIENT_H0_OFFSET 2
#define GRADIENT_H1_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 2 + 2)
#define GRADIENT_V0_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 4 + 2)
#define GRADIENT_V1_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 6 + 2)
#define VX_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 8)
#define VY_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 8 + BDOF_BLOCK_SIZE * 2)
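/*
 * Hedged C view of the scratch frame reserved on the stack below (the
 * offsets above index into it; the +2 in the offsets skips one guard
 * int16_t at the start of each gradient plane, matching the
 * gradient_h[2] pointer comment inside the function):
 *
 *     struct {
 *         int16_t gradient_buf_h[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2];
 *         int16_t gradient_buf_v[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2];
 *         int16_t vx[BDOF_BLOCK_SIZE], vy[BDOF_BLOCK_SIZE];
 *     };
 */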
function ff_vvc_apply_bdof_8_neon, export=1
mov w6, #8
0:
stp x19, x20, [sp, #-0x40]!
stp x21, x22, [sp, #0x10]
stp x23, x24, [sp, #0x20]
stp x25, x30, [sp, #0x30]
sub sp, sp, #APPLY_BDOF_STACK_SIZE
mov w19, w6 // bit_depth
mov x20, x0 // dst
mov x21, x1 // dst_stride
mov x22, x2 // src0
mov x23, x3 // src1
mov w24, w4 // block_w
mov w25, w5 // block_h
// int16_t *gradient_h[2] = {&gradient_buf_h[0][1], &gradient_buf_h[1][1]};
add x0, sp, #GRADIENT_H0_OFFSET
add x1, sp, #GRADIENT_H1_OFFSET
add x2, sp, #GRADIENT_V0_OFFSET
add x3, sp, #GRADIENT_V1_OFFSET
mov x4, x22
mov x5, x23
mov w6, w24
mov w7, w25
bl vvc_bdof_grad_filter_8x_neon
cmp w24, #8
mov x0, x22 // src0
mov x1, x23 // src1
add x2, sp, #GRADIENT_H0_OFFSET // gh0
add x3, sp, #GRADIENT_V0_OFFSET // gv0
add x4, sp, #VX_OFFSET // vx
add x5, sp, #VY_OFFSET // vy
mov w6, w25 // block_h
b.gt 16f
bl vvc_derive_bdof_vx_vy_8x_neon
cmp w19, #10 // check bitdepth
mov x0, x20 // dst
mov x1, x21 // dst_stride
mov x2, x22 // src0
mov x3, x23 // src1
add x4, sp, #GRADIENT_H1_OFFSET // gh1
add x5, sp, #GRADIENT_V1_OFFSET // gv1
add x6, sp, #VX_OFFSET
add x7, sp, #VY_OFFSET
str w25, [sp]
b.eq 1f
b.gt 2f
// 8bit
0:
bl vvc_apply_bdof_block_8x_8_neon
b 32f
1:
// 10bit
bl vvc_apply_bdof_block_8x_10_neon
b 32f
2:
// 12bit
bl vvc_apply_bdof_block_8x_12_neon
b 32f
16:
bl vvc_derive_bdof_vx_vy_16x_neon
cmp w19, #10 // check bitdepth
mov x0, x20 // dst
mov x1, x21 // dst_stride
mov x2, x22 // src0
mov x3, x23 // src1
add x4, sp, #GRADIENT_H1_OFFSET // gh1
add x5, sp, #GRADIENT_V1_OFFSET // gv1
add x6, sp, #VX_OFFSET
add x7, sp, #VY_OFFSET
str w25, [sp]
b.eq 17f
b.gt 18f
// 8bit
bl vvc_apply_bdof_block_16x_8_neon
b 32f
17:
// 10bit
bl vvc_apply_bdof_block_16x_10_neon
b 32f
18:
// 12bit
bl vvc_apply_bdof_block_16x_12_neon
32:
add sp, sp, #APPLY_BDOF_STACK_SIZE
ldp x25, x30, [sp, #0x30]
ldp x23, x24, [sp, #0x20]
ldp x21, x22, [sp, #0x10]
ldp x19, x20, [sp], #0x40
ret
endfunc
#undef APPLY_BDOF_STACK_SIZE
#undef GRADIENT_H0_OFFSET
#undef GRADIENT_H1_OFFSET
#undef GRADIENT_V0_OFFSET
#undef GRADIENT_V1_OFFSET
#undef VX_OFFSET
#undef VY_OFFSET
#define VVC_MAX_PB_SIZE 128
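/*
 * 8-tap horizontal luma filter for 10/12-bit input. A hedged C sketch
 * of what one row computes (hf holds the 8 signed taps; shift is 2 for
 * 10-bit and 4 for 12-bit so results land in the 16-bit intermediate
 * format):
 *
 *     for (x = 0; x < width; x++) {
 *         int32_t sum = 0;
 *         for (int i = 0; i < 8; i++)
 *             sum += src[x + i - 3] * hf[i];
 *         dst[x] = av_clip_int16(sum >> shift);    // sqshrn
 *     }
 */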
.macro put_luma_h_x8_vector_filter shift
// 8 filter taps from hf, sign-extended into v0.8h
// 32 bytes from _src loaded into v20.8h & v21.8h; v21.8h supplies the
// overlap for the shifted vectors v1.8h, ..., v6.8h and v17.8h
// v24.4h & v25.4h hold the output to be stored
ext v1.16b, v20.16b, v21.16b, #2
ext v2.16b, v20.16b, v21.16b, #4
ext v3.16b, v20.16b, v21.16b, #6
ext v4.16b, v20.16b, v21.16b, #8
ext v5.16b, v20.16b, v21.16b, #10
ext v6.16b, v20.16b, v21.16b, #12
ext v17.16b, v20.16b, v21.16b, #14
smull v24.4s, v20.4h, v0.h[0]
smull2 v25.4s, v20.8h, v0.h[0]
smlal v24.4s, v1.4h, v0.h[1]
smlal2 v25.4s, v1.8h, v0.h[1]
smlal v24.4s, v2.4h, v0.h[2]
smlal2 v25.4s, v2.8h, v0.h[2]
smlal v24.4s, v3.4h, v0.h[3]
smlal2 v25.4s, v3.8h, v0.h[3]
smlal v24.4s, v4.4h, v0.h[4]
smlal2 v25.4s, v4.8h, v0.h[4]
smlal v24.4s, v5.4h, v0.h[5]
smlal2 v25.4s, v5.8h, v0.h[5]
smlal v24.4s, v6.4h, v0.h[6]
smlal2 v25.4s, v6.8h, v0.h[6]
smlal v24.4s, v17.4h, v0.h[7]
smlal2 v25.4s, v17.8h, v0.h[7]
sqshrn v24.4h, v24.4s, #(\shift)
sqshrn v25.4h, v25.4s, #(\shift)
.endm
.macro put_luma_h8_xx_neon shift
mov x9, #(VVC_MAX_PB_SIZE * 2)
ld1 {v0.8b}, [x4]
sub x1, x1, #6
sxtl v0.8h, v0.8b
1:
ld1 {v20.8h, v21.8h}, [x1], x2
put_luma_h_x8_vector_filter \shift
subs w3, w3, #1
st1 {v24.4h, v25.4h}, [x0], x9
b.gt 1b
ret
.endm
.macro put_luma_h16_xx_neon shift
mov x9, #(VVC_MAX_PB_SIZE * 2)
ld1 {v0.8b}, [x4]
sub x9, x9, #16
sub x1, x1, #6
sxtl v0.8h, v0.8b
1:
ld1 {v20.8h, v21.8h, v22.8h}, [x1], x2
put_luma_h_x8_vector_filter \shift
mov v20.16b, v21.16b
mov v21.16b, v22.16b
st1 {v24.4h, v25.4h}, [x0], #16
put_luma_h_x8_vector_filter \shift
subs w3, w3, #1
st1 {v24.4h, v25.4h}, [x0], x9
b.gt 1b
ret
.endm
.macro put_luma_h_x16_xx_neon shift
mov x9, #(VVC_MAX_PB_SIZE * 2)
ld1 {v0.8b}, [x4]
sub x9, x9, w6, uxtw #1
sub x2, x2, w6, uxtw #1
sxtl v0.8h, v0.8b
sub x1, x1, #6
sub x2, x2, #16
1:
ld1 {v20.8h}, [x1], #16
mov w8, w6
2:
ld1 {v21.8h, v22.8h}, [x1], #32
put_luma_h_x8_vector_filter \shift
mov v20.16b, v21.16b
mov v21.16b, v22.16b
st1 {v24.4h, v25.4h}, [x0], #16
put_luma_h_x8_vector_filter \shift
mov v20.16b, v21.16b
subs w8, w8, #16
st1 {v24.4h, v25.4h}, [x0], #16
b.gt 2b
subs w3, w3, #1
add x0, x0, x9
add x1, x1, x2
b.gt 1b
ret
.endm
function ff_vvc_put_luma_h8_10_neon, export=1
put_luma_h8_xx_neon 2
endfunc
function ff_vvc_put_luma_h8_12_neon, export=1
put_luma_h8_xx_neon 4
endfunc
function ff_vvc_put_luma_h16_10_neon, export=1
put_luma_h16_xx_neon 2
endfunc
function ff_vvc_put_luma_h16_12_neon, export=1
put_luma_h16_xx_neon 4
endfunc
function ff_vvc_put_luma_h_x16_10_neon, export=1
put_luma_h_x16_xx_neon 2
endfunc
function ff_vvc_put_luma_h_x16_12_neon, export=1
put_luma_h_x16_xx_neon 4
endfunc