RPi4 (auto-vectorisation is turned on):

put_luma_h_10_4x4_c:        282.8 ( 1.00x)
put_luma_h_10_8x8_c:       1069.5 ( 1.00x)
put_luma_h_10_8x8_neon:     207.5 ( 5.15x)
put_luma_h_10_16x16_c:     1999.6 ( 1.00x)
put_luma_h_10_16x16_neon:   777.5 ( 2.57x)
put_luma_h_10_32x32_c:     6612.9 ( 1.00x)
put_luma_h_10_32x32_neon:  3201.6 ( 2.07x)
put_luma_h_10_64x64_c:    25059.0 ( 1.00x)
put_luma_h_10_64x64_neon: 13623.5 ( 1.84x)
put_luma_h_10_128x128_c:  91310.1 ( 1.00x)
put_luma_h_10_128x128_neon: 50358.3 ( 1.81x)
put_luma_h_12_4x4_c:        282.1 ( 1.00x)
put_luma_h_12_8x8_c:       1068.4 ( 1.00x)
put_luma_h_12_8x8_neon:     207.7 ( 5.14x)
put_luma_h_12_16x16_c:     1998.0 ( 1.00x)
put_luma_h_12_16x16_neon:   777.5 ( 2.57x)
put_luma_h_12_32x32_c:     6612.0 ( 1.00x)
put_luma_h_12_32x32_neon:  3201.6 ( 2.07x)
put_luma_h_12_64x64_c:    25036.8 ( 1.00x)
put_luma_h_12_64x64_neon: 13595.1 ( 1.84x)
put_luma_h_12_128x128_c:  91305.8 ( 1.00x)
put_luma_h_12_128x128_neon: 50359.7 ( 1.81x)

Apple M2 Air (auto-vectorisation is turned on):

put_luma_h_10_4x4_c:          0.3 ( 1.00x)
put_luma_h_10_8x8_c:          1.0 ( 1.00x)
put_luma_h_10_8x8_neon:       0.4 ( 2.59x)
put_luma_h_10_16x16_c:        2.9 ( 1.00x)
put_luma_h_10_16x16_neon:     1.4 ( 2.01x)
put_luma_h_10_32x32_c:        9.4 ( 1.00x)
put_luma_h_10_32x32_neon:     5.8 ( 1.62x)
put_luma_h_10_64x64_c:       35.6 ( 1.00x)
put_luma_h_10_64x64_neon:    23.6 ( 1.51x)
put_luma_h_10_128x128_c:    131.1 ( 1.00x)
put_luma_h_10_128x128_neon:  92.6 ( 1.42x)
put_luma_h_12_4x4_c:          0.3 ( 1.00x)
put_luma_h_12_8x8_c:          1.0 ( 1.00x)
put_luma_h_12_8x8_neon:       0.4 ( 2.58x)
put_luma_h_12_16x16_c:        2.9 ( 1.00x)
put_luma_h_12_16x16_neon:     1.4 ( 2.00x)
put_luma_h_12_32x32_c:        9.4 ( 1.00x)
put_luma_h_12_32x32_neon:     5.8 ( 1.61x)
put_luma_h_12_64x64_c:       35.3 ( 1.00x)
put_luma_h_12_64x64_neon:    23.3 ( 1.52x)
put_luma_h_12_128x128_c:    131.2 ( 1.00x)
put_luma_h_12_128x128_neon:  92.4 ( 1.42x)
/*
 * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

#define VVC_MAX_PB_SIZE 128
#define BDOF_BLOCK_SIZE 16
#define BDOF_MIN_BLOCK_SIZE 4

.macro vvc_w_avg bit_depth

.macro vvc_w_avg_\bit_depth\()_2_4 tap
.if \tap == 2
        ldr             s0, [src0]
        ldr             s2, [src1]
.else
        ldr             d0, [src0]
        ldr             d2, [src1]
.endif
        mov             v4.16b, v16.16b
        smlal           v4.4s, v0.4h, v19.4h
        smlal           v4.4s, v2.4h, v20.4h
        sqshl           v4.4s, v4.4s, v22.4s
        sqxtun          v4.4h, v4.4s

.if \bit_depth == 8
        sqxtun          v4.8b, v4.8h
.if \tap == 2
        str             h4, [dst]
.else // tap == 4
        str             s4, [dst]
.endif

.else // bit_depth > 8
        umin            v4.4h, v4.4h, v17.4h
.if \tap == 2
        str             s4, [dst]
.else
        str             d4, [dst]
.endif
.endif
        add             src0, src0, x10
        add             src1, src1, x10
        add             dst, dst, dst_stride
.endm

function ff_vvc_w_avg_\bit_depth\()_neon, export=1
        dst             .req x0
        dst_stride      .req x1
        src0            .req x2
        src1            .req x3
        width           .req w4
        height          .req w5

        mov             x10, #(VVC_MAX_PB_SIZE * 2)
        cmp             width, #8
        lsr             x11, x6, #32            // weight0
        mov             w12, w6                 // weight1
        lsr             x13, x7, #32            // offset
        mov             w14, w7                 // shift

        dup             v19.8h, w11
        neg             w14, w14                // so we can use sqshl
        dup             v20.8h, w12
        dup             v16.4s, w13
        dup             v22.4s, w14

.if \bit_depth >= 10
        // clip pixel
        mov             w6, #((1 << \bit_depth) - 1)
        dup             v17.8h, w6
.endif

        b.eq            8f
        b.hi            16f
        cmp             width, #4
        b.eq            4f
2:      // width == 2
        subs            height, height, #1
        vvc_w_avg_\bit_depth\()_2_4 2
        b.ne            2b
        b               32f
4:      // width == 4
        subs            height, height, #1
        vvc_w_avg_\bit_depth\()_2_4 4
        b.ne            4b
        b               32f
8:      // width == 8
        ld1             {v0.8h}, [src0], x10
        ld1             {v2.8h}, [src1], x10
        mov             v4.16b, v16.16b
        mov             v5.16b, v16.16b
        smlal           v4.4s, v0.4h, v19.4h
        smlal           v4.4s, v2.4h, v20.4h
        smlal2          v5.4s, v0.8h, v19.8h
        smlal2          v5.4s, v2.8h, v20.8h
        sqshl           v4.4s, v4.4s, v22.4s
        sqshl           v5.4s, v5.4s, v22.4s
        sqxtun          v4.4h, v4.4s
        sqxtun2         v4.8h, v5.4s
        subs            height, height, #1
.if \bit_depth == 8
        sqxtun          v4.8b, v4.8h
        st1             {v4.8b}, [dst], dst_stride
.else
        umin            v4.8h, v4.8h, v17.8h
        st1             {v4.8h}, [dst], dst_stride
.endif
        b.ne            8b
        b               32f
16:     // width >= 16
        mov             w6, width
        mov             x7, src0
        mov             x8, src1
        mov             x9, dst
17:
        ldp             q0, q1, [x7], #32
        ldp             q2, q3, [x8], #32
        mov             v4.16b, v16.16b
        mov             v5.16b, v16.16b
        mov             v6.16b, v16.16b
        mov             v7.16b, v16.16b
        smlal           v4.4s, v0.4h, v19.4h
        smlal           v4.4s, v2.4h, v20.4h
        smlal2          v5.4s, v0.8h, v19.8h
        smlal2          v5.4s, v2.8h, v20.8h
        smlal           v6.4s, v1.4h, v19.4h
        smlal           v6.4s, v3.4h, v20.4h
        smlal2          v7.4s, v1.8h, v19.8h
        smlal2          v7.4s, v3.8h, v20.8h
        sqshl           v4.4s, v4.4s, v22.4s
        sqshl           v5.4s, v5.4s, v22.4s
        sqshl           v6.4s, v6.4s, v22.4s
        sqshl           v7.4s, v7.4s, v22.4s
        sqxtun          v4.4h, v4.4s
        sqxtun          v6.4h, v6.4s
        sqxtun2         v4.8h, v5.4s
        sqxtun2         v6.8h, v7.4s
        subs            w6, w6, #16
.if \bit_depth == 8
        sqxtun          v4.8b, v4.8h
        sqxtun2         v4.16b, v6.8h
        str             q4, [x9], #16
.else
        umin            v4.8h, v4.8h, v17.8h
        umin            v6.8h, v6.8h, v17.8h
        stp             q4, q6, [x9], #32
.endif
        b.ne            17b

        subs            height, height, #1
        add             src0, src0, x10
        add             src1, src1, x10
        add             dst, dst, dst_stride
        b.ne            16b
32:
        ret

.unreq dst
.unreq dst_stride
.unreq src0
.unreq src1
.unreq width
.unreq height
endfunc
.endm

vvc_w_avg 8
vvc_w_avg 10
vvc_w_avg 12
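/*
 * For reference, a rough C model of what ff_vvc_w_avg_*_neon computes per
 * pixel (an illustrative sketch, not FFmpeg's C reference; weight0, weight1,
 * offset and shift are unpacked from x6/x7 exactly as in the asm above):
 *
 *     int w  = (src0[i] * weight0 + src1[i] * weight1 + offset) >> shift;
 *     dst[i] = av_clip(w, 0, (1 << bit_depth) - 1);   // sqxtun / umin above
 */
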
.macro vvc_avg bit_depth
function ff_vvc_avg_\bit_depth\()_neon, export=1
        mov             x10, #(VVC_MAX_PB_SIZE * 2)
        movi            v16.8h, #0
        movi            v17.16b, #255
        ushr            v17.8h, v17.8h, #(16 - \bit_depth)

        cmp             w4, #8
        b.gt            16f
        b.eq            8f
        cmp             w4, #4
        b.eq            4f

2:      // width == 2
        ldr             s0, [x2]
        subs            w5, w5, #1
        ldr             s1, [x3]
.if \bit_depth == 8
        shadd           v0.4h, v0.4h, v1.4h
        sqrshrun        v0.8b, v0.8h, #(15 - 1 - \bit_depth)
        str             h0, [x0]
.else
        shadd           v0.4h, v0.4h, v1.4h
        srshr           v0.4h, v0.4h, #(15 - 1 - \bit_depth)
        smax            v0.4h, v0.4h, v16.4h
        smin            v0.4h, v0.4h, v17.4h
        str             s0, [x0]
.endif
        add             x2, x2, #(VVC_MAX_PB_SIZE * 2)
        add             x3, x3, #(VVC_MAX_PB_SIZE * 2)
        add             x0, x0, x1
        b.ne            2b
        ret

4:      // width == 4
        ldr             d0, [x2]
        subs            w5, w5, #1
        ldr             d1, [x3]
.if \bit_depth == 8
        shadd           v0.4h, v0.4h, v1.4h
        sqrshrun        v0.8b, v0.8h, #(15 - 1 - \bit_depth)
        str             s0, [x0]
.else
        shadd           v0.4h, v0.4h, v1.4h
        srshr           v0.4h, v0.4h, #(15 - 1 - \bit_depth)
        smax            v0.4h, v0.4h, v16.4h
        smin            v0.4h, v0.4h, v17.4h
        str             d0, [x0]
.endif
        add             x2, x2, #(VVC_MAX_PB_SIZE * 2)
        add             x3, x3, #(VVC_MAX_PB_SIZE * 2)
        add             x0, x0, x1
        b.ne            4b
        ret

8:      // width == 8
        ldr             q0, [x2]
        subs            w5, w5, #1
        ldr             q1, [x3]
.if \bit_depth == 8
        shadd           v0.8h, v0.8h, v1.8h
        sqrshrun        v0.8b, v0.8h, #(15 - 1 - \bit_depth)
        str             d0, [x0]
.else
        shadd           v0.8h, v0.8h, v1.8h
        srshr           v0.8h, v0.8h, #(15 - 1 - \bit_depth)
        smax            v0.8h, v0.8h, v16.8h
        smin            v0.8h, v0.8h, v17.8h
        str             q0, [x0]
.endif
        add             x2, x2, #(VVC_MAX_PB_SIZE * 2)
        add             x3, x3, #(VVC_MAX_PB_SIZE * 2)
        add             x0, x0, x1
        b.ne            8b
        ret

16:     // width >= 16
.if \bit_depth == 8
        sub             x1, x1, w4, sxtw
.else
        sub             x1, x1, w4, sxtw #1
.endif
        sub             x10, x10, w4, sxtw #1
3:
        mov             w6, w4                  // width
1:
        ldp             q0, q1, [x2], #32
        subs            w6, w6, #16
        ldp             q2, q3, [x3], #32
.if \bit_depth == 8
        shadd           v4.8h, v0.8h, v2.8h
        shadd           v5.8h, v1.8h, v3.8h
        sqrshrun        v0.8b, v4.8h, #6
        sqrshrun2       v0.16b, v5.8h, #6
        st1             {v0.16b}, [x0], #16
.else
        shadd           v4.8h, v0.8h, v2.8h
        shadd           v5.8h, v1.8h, v3.8h
        srshr           v0.8h, v4.8h, #(15 - 1 - \bit_depth)
        srshr           v1.8h, v5.8h, #(15 - 1 - \bit_depth)
        smax            v0.8h, v0.8h, v16.8h
        smax            v1.8h, v1.8h, v16.8h
        smin            v0.8h, v0.8h, v17.8h
        smin            v1.8h, v1.8h, v17.8h
        stp             q0, q1, [x0], #32
.endif
        b.ne            1b

        subs            w5, w5, #1
        add             x2, x2, x10
        add             x3, x3, x10
        add             x0, x0, x1
        b.ne            3b
        ret
endfunc
.endm

vvc_avg 8
vvc_avg 10
vvc_avg 12
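/*
 * Rough C model of ff_vvc_avg_*_neon per pixel (illustrative sketch, not
 * FFmpeg's C reference). shadd halves the sum without overflowing int16,
 * then srshr/sqrshrun apply the remaining rounded shift:
 *
 *     int sum = (src0[i] + src1[i]) >> 1;                             // shadd
 *     int w   = (sum + (1 << (13 - bit_depth))) >> (14 - bit_depth);  // srshr
 *     dst[i]  = av_clip(w, 0, (1 << bit_depth) - 1);
 */
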
/* x0: int16_t *dst
 * x1: const uint8_t *_src
 * x2: ptrdiff_t _src_stride
 * w3: int height
 * x4: intptr_t mx
 * x5: intptr_t my
 * w6: int width
 */
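/*
 * Rough C model of the 8-bit DMVR copy below (a sketch for orientation;
 * this path does no filtering, it only widens the pixels into the 14-bit
 * intermediate domain used by the other helpers):
 *
 *     for (int y = 0; y < height; y++) {
 *         for (int x = 0; x < width; x++)
 *             dst[x] = src[x] << 2;            // ushll #2
 *         src += src_stride;
 *         dst += VVC_MAX_PB_SIZE;
 *     }
 */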
function ff_vvc_dmvr_8_neon, export=1
        dst             .req x0
        src             .req x1
        src_stride      .req x2
        height          .req w3
        mx              .req x4
        my              .req x5
        width           .req w6

        sxtw            x6, w6
        mov             x7, #(VVC_MAX_PB_SIZE * 2 + 8)
        cmp             width, #16
        sub             src_stride, src_stride, x6
        cset            w15, gt                 // width > 16
        movi            v16.8h, #2              // DMVR_SHIFT
        sub             x7, x7, x6, lsl #1
1:
        cbz             w15, 2f
        ldr             q0, [src], #16
        ushll           v1.8h, v0.8b, #2
        ushll2          v2.8h, v0.16b, #2
        stp             q1, q2, [dst], #32
        b               3f
2:
        ldr             d0, [src], #8
        ushll           v1.8h, v0.8b, #2
        str             q1, [dst], #16
3:
        subs            height, height, #1
        ldr             s3, [src], #4
        ushll           v4.8h, v3.8b, #2
        st1             {v4.4h}, [dst], x7

        add             src, src, src_stride
        b.ne            1b

        ret
endfunc

function ff_vvc_dmvr_12_neon, export=1
        sxtw            x6, w6
        mov             x7, #(VVC_MAX_PB_SIZE * 2 + 8)
        cmp             width, #16
        sub             src_stride, src_stride, x6, lsl #1
        cset            w15, gt                 // width > 16
        sub             x7, x7, x6, lsl #1
1:
        cbz             w15, 2f
        ldp             q0, q1, [src], #32
        urshr           v0.8h, v0.8h, #2
        urshr           v1.8h, v1.8h, #2

        stp             q0, q1, [dst], #32
        b               3f
2:
        ldr             q0, [src], #16
        urshr           v0.8h, v0.8h, #2
        str             q0, [dst], #16
3:
        subs            height, height, #1
        ldr             d0, [src], #8
        urshr           v0.4h, v0.4h, #2
        st1             {v0.4h}, [dst], x7

        add             src, src, src_stride
        b.ne            1b

        ret
endfunc
function ff_vvc_dmvr_v_8_neon, export=1
        movrel          x7, X(ff_vvc_inter_luma_dmvr_filters)
        add             x7, x7, x5, lsl #1
        ld2r            {v0.16b, v1.16b}, [x7]
        tbz             w6, #4, 12f

        ldr             s16, [x1, #16]
        ld1             {v2.16b}, [x1], x2
20:
        ldr             s17, [x1, #16]
        umull           v4.8h, v0.8b, v2.8b
        umull2          v5.8h, v0.16b, v2.16b
        ld1             {v3.16b}, [x1], x2
        umull           v16.8h, v0.8b, v16.8b
        umull           v6.8h, v1.8b, v3.8b
        umull2          v7.8h, v1.16b, v3.16b
        add             v4.8h, v4.8h, v6.8h
        umull           v18.8h, v1.8b, v17.8b
        add             v5.8h, v5.8h, v7.8h
        urshr           v4.8h, v4.8h, #2
        add             v19.4h, v16.4h, v18.4h
        urshr           v5.8h, v5.8h, #2
        urshr           v19.4h, v19.4h, #2
        st1             {v4.8h, v5.8h}, [x0], #32
        subs            w3, w3, #1
        mov             v2.16b, v3.16b
        st1             {v19.4h}, [x0], #8
        mov             v16.16b, v17.16b
        add             x0, x0, #(VVC_MAX_PB_SIZE * 2 - 32 - 8)
        b.ne            20b
        ret

12:
        ldr             s16, [x1, #8]
        ld1             {v2.8b}, [x1], x2
2:
        ldr             s17, [x1, #8]
        umull           v4.8h, v0.8b, v2.8b
        ld1             {v3.8b}, [x1], x2
        umull           v16.8h, v0.8b, v16.8b
        umull           v6.8h, v1.8b, v3.8b
        add             v4.8h, v4.8h, v6.8h
        umull           v18.8h, v1.8b, v17.8b
        srshr           v4.8h, v4.8h, #2
        add             v19.4h, v16.4h, v18.4h
        srshr           v19.4h, v19.4h, #2
        st1             {v4.8h}, [x0], #16
        subs            w3, w3, #1
        mov             v2.16b, v3.16b
        st1             {v19.4h}, [x0], #8
        mov             v16.16b, v17.16b
        add             x0, x0, #(VVC_MAX_PB_SIZE * 2 - 16 - 8)
        b.ne            2b
        ret
endfunc

function ff_vvc_dmvr_h_8_neon, export=1
        movrel          x7, X(ff_vvc_inter_luma_dmvr_filters)
        add             x7, x7, x4, lsl #1
        ld2r            {v0.16b, v1.16b}, [x7]
        tbz             w6, #4, 12f
20:
        ldur            q3, [x1, #1]
        ldr             q2, [x1]
        umull           v4.8h, v0.8b, v2.8b
        umull2          v5.8h, v0.16b, v2.16b
        ldur            s17, [x1, #17]
        umull           v6.8h, v1.8b, v3.8b
        ldr             s16, [x1, #16]
        umull2          v7.8h, v1.16b, v3.16b
        add             v4.8h, v4.8h, v6.8h
        umull           v17.8h, v1.8b, v17.8b
        add             v5.8h, v5.8h, v7.8h
        umull           v16.8h, v0.8b, v16.8b
        srshr           v4.8h, v4.8h, #2
        add             v16.4h, v16.4h, v17.4h
        srshr           v5.8h, v5.8h, #2
        srshr           v16.4h, v16.4h, #2
        st1             {v4.8h, v5.8h}, [x0], #32
        subs            w3, w3, #1
        st1             {v16.4h}, [x0], #8
        add             x1, x1, x2
        add             x0, x0, #(VVC_MAX_PB_SIZE * 2 - 32 - 8)
        b.ne            20b
        ret

12:
        ldur            d3, [x1, #1]
        ldr             d2, [x1]
        umull           v4.8h, v0.8b, v2.8b
        ldur            s17, [x1, #9]
        umull           v6.8h, v1.8b, v3.8b
        ldr             s16, [x1, #8]
        add             v4.8h, v4.8h, v6.8h
        umull           v17.8h, v1.8b, v17.8b
        umull           v16.8h, v0.8b, v16.8b
        srshr           v4.8h, v4.8h, #2
        add             v16.4h, v16.4h, v17.4h
        srshr           v16.4h, v16.4h, #2
        st1             {v4.8h}, [x0], #16
        subs            w3, w3, #1
        st1             {v16.4h}, [x0], #8
        add             x1, x1, x2
        add             x0, x0, #(VVC_MAX_PB_SIZE * 2 - 16 - 8)
        b.ne            12b
        ret
endfunc
.macro vvc_dmvr_h_10 bit_depth
function ff_vvc_dmvr_h_\bit_depth\()_neon, export=1
        movrel          x7, X(ff_vvc_inter_luma_dmvr_filters)
        add             x7, x7, x4, lsl #1
        ld2r            {v0.16b, v1.16b}, [x7]
        uxtl            v0.8h, v0.8b
        uxtl            v1.8h, v1.8b
        tbz             w6, #4, 12f
20:
        ldur            q3, [x1, #2]
        ldr             q2, [x1]
        ldr             q22, [x1, #16]
        mul             v4.8h, v0.8h, v2.8h
        mul             v6.8h, v1.8h, v3.8h
        ldur            q23, [x1, #18]
        mul             v5.8h, v0.8h, v22.8h
        ldur            d17, [x1, #34]
        mul             v7.8h, v1.8h, v23.8h
        uhadd           v4.8h, v4.8h, v6.8h
        ldr             d16, [x1, #32]
        uhadd           v5.8h, v5.8h, v7.8h
        mul             v17.4h, v1.4h, v17.4h
        mul             v16.4h, v0.4h, v16.4h
        urshr           v4.8h, v4.8h, #(\bit_depth - 6 - 1)
        urshr           v5.8h, v5.8h, #(\bit_depth - 6 - 1)
        uhadd           v16.4h, v16.4h, v17.4h
        urshr           v16.4h, v16.4h, #(\bit_depth - 6 - 1)
        st1             {v4.8h, v5.8h}, [x0], #32
        subs            w3, w3, #1
        st1             {v16.4h}, [x0], #8
        add             x1, x1, x2
        add             x0, x0, #(VVC_MAX_PB_SIZE * 2 - 32 - 8)
        b.ne            20b
        ret

12:
        ldur            q3, [x1, #2]
        ldr             q2, [x1]
        mul             v4.8h, v0.8h, v2.8h
        ldur            d17, [x1, #18]
        mul             v6.8h, v1.8h, v3.8h
        ldr             d16, [x1, #16]
        uhadd           v4.8h, v4.8h, v6.8h
        mul             v17.4h, v1.4h, v17.4h
        mul             v16.4h, v0.4h, v16.4h
        urshr           v4.8h, v4.8h, #(\bit_depth - 6 - 1)
        uhadd           v16.4h, v16.4h, v17.4h
        urshr           v16.4h, v16.4h, #(\bit_depth - 6 - 1)
        st1             {v4.8h}, [x0], #16
        subs            w3, w3, #1
        st1             {v16.4h}, [x0], #8
        add             x1, x1, x2
        add             x0, x0, #(VVC_MAX_PB_SIZE * 2 - 16 - 8)
        b.ne            12b
        ret
endfunc
.endm

vvc_dmvr_h_10 10
vvc_dmvr_h_10 12
function ff_vvc_dmvr_hv_8_neon, export=1
        tmp0            .req x7
        tmp1            .req x8

        sub             sp, sp, #(VVC_MAX_PB_SIZE * 4)

        movrel          x9, X(ff_vvc_inter_luma_dmvr_filters)
        add             x12, x9, mx, lsl #1
        mov             tmp0, sp
        add             tmp1, tmp0, #(VVC_MAX_PB_SIZE * 2)
        // We know the values are positive
        ld2r            {v0.16b, v1.16b}, [x12]

        add             x12, x9, my, lsl #1
        ldrb            w10, [x12]
        ldrb            w11, [x12, #1]
        sxtw            x6, w6
        dup             v2.8h, w10              // filter_y[0]
        dup             v3.8h, w11              // filter_y[1]

        // The only valid values for width are 8 + 4 and 16 + 4
        cmp             width, #16
        mov             w10, #0                 // start filter_y or not
        add             height, height, #1
        sub             dst, dst, #(VVC_MAX_PB_SIZE * 2)
        sub             src_stride, src_stride, x6
        cset            w15, gt                 // width > 16
1:
        mov             x12, tmp0
        mov             x13, tmp1
        mov             x14, dst
        cbz             w15, 2f

        // width > 16
        ldur            q5, [src, #1]
        ldr             q4, [src], #16
        umull           v6.8h, v4.8b, v0.8b
        umull2          v16.8h, v4.16b, v0.16b
        umlal           v6.8h, v5.8b, v1.8b
        umlal2          v16.8h, v5.16b, v1.16b
        urshr           v6.8h, v6.8h, #(8 - 6)
        urshr           v7.8h, v16.8h, #(8 - 6)
        stp             q6, q7, [x13], #32

        cbz             w10, 3f

        ldp             q16, q17, [x12], #32
        mul             v16.8h, v16.8h, v2.8h
        mul             v17.8h, v17.8h, v2.8h
        mla             v16.8h, v6.8h, v3.8h
        mla             v17.8h, v7.8h, v3.8h
        urshr           v16.8h, v16.8h, #4
        urshr           v17.8h, v17.8h, #4
        stp             q16, q17, [x14], #32
        b               3f
2:
        // width > 8
        ldur            d5, [src, #1]
        ldr             d4, [src], #8
        umull           v6.8h, v4.8b, v0.8b
        umlal           v6.8h, v5.8b, v1.8b
        urshr           v6.8h, v6.8h, #(8 - 6)
        str             q6, [x13], #16

        cbz             w10, 3f

        ldr             q16, [x12], #16
        mul             v16.8h, v16.8h, v2.8h
        mla             v16.8h, v6.8h, v3.8h
        urshr           v16.8h, v16.8h, #4
        str             q16, [x14], #16
3:
        ldur            s5, [src, #1]
        ldr             s4, [src], #4
        umull           v6.8h, v4.8b, v0.8b
        umlal           v6.8h, v5.8b, v1.8b
        urshr           v6.4h, v6.4h, #(8 - 6)
        str             d6, [x13], #8

        cbz             w10, 4f

        ldr             d16, [x12], #8
        mul             v16.4h, v16.4h, v2.4h
        mla             v16.4h, v6.4h, v3.4h
        urshr           v16.4h, v16.4h, #4
        str             d16, [x14], #8
4:
        subs            height, height, #1
        mov             w10, #1
        add             src, src, src_stride
        add             dst, dst, #(VVC_MAX_PB_SIZE * 2)
        // swap tmp0 and tmp1 (three-XOR swap)
        eor             tmp0, tmp0, tmp1
        eor             tmp1, tmp0, tmp1
        eor             tmp0, tmp0, tmp1
        b.ne            1b

        add             sp, sp, #(VVC_MAX_PB_SIZE * 4)
        ret
endfunc
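/*
 * The hv entry points implement a separable 2-tap (bilinear) filter with
 * two row buffers on the stack that the three-XOR swap rotates every
 * iteration. A rough C model of one row (illustrative sketch; fx/fy are
 * the two ff_vvc_inter_luma_dmvr_filters taps selected by mx/my, and the
 * horizontal shift is bit_depth - 6 in general, 2 in the 8-bit case shown):
 *
 *     for (int x = 0; x < width; x++)                   // horizontal pass
 *         row1[x] = (fx[0] * src[x] + fx[1] * src[x + 1] + 2) >> 2;
 *     for (int x = 0; x < width; x++)                   // vertical pass
 *         dst[x] = (fy[0] * row0[x] + fy[1] * row1[x] + 8) >> 4;
 */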
function ff_vvc_dmvr_hv_12_neon, export=1
        mvni            v29.4s, #(12 - 6 - 1)
        b               0f
endfunc

function ff_vvc_dmvr_hv_10_neon, export=1
        mvni            v29.4s, #(10 - 6 - 1)
0:
        sub             sp, sp, #(VVC_MAX_PB_SIZE * 4)

        movrel          x9, X(ff_vvc_inter_luma_dmvr_filters)
        add             x12, x9, mx, lsl #1
        ldrb            w10, [x12]
        ldrb            w11, [x12, #1]
        mov             tmp0, sp
        add             tmp1, tmp0, #(VVC_MAX_PB_SIZE * 2)
        // We know the values are positive
        dup             v0.8h, w10              // filter_x[0]
        dup             v1.8h, w11              // filter_x[1]

        add             x12, x9, my, lsl #1
        ldrb            w10, [x12]
        ldrb            w11, [x12, #1]
        dup             v2.8h, w10              // filter_y[0]
        dup             v3.8h, w11              // filter_y[1]

        // The only valid values for width are 8 + 4 and 16 + 4
        cmp             width, #16
        mov             w10, #0                 // start filter_y or not
        add             height, height, #1
        sub             dst, dst, #(VVC_MAX_PB_SIZE * 2)
        sub             src_stride, src_stride, w6, sxtw #1
        cset            w15, gt                 // width > 16
1:
        mov             x12, tmp0
        mov             x13, tmp1
        mov             x14, dst
        cbz             w15, 2f

        // width > 16
        add             x16, src, #2
        ldp             q6, q16, [src], #32
        ldp             q7, q17, [x16]
        umull           v4.4s, v6.4h, v0.4h
        umull2          v5.4s, v6.8h, v0.8h
        umull           v18.4s, v16.4h, v0.4h
        umull2          v19.4s, v16.8h, v0.8h
        umlal           v4.4s, v7.4h, v1.4h
        umlal2          v5.4s, v7.8h, v1.8h
        umlal           v18.4s, v17.4h, v1.4h
        umlal2          v19.4s, v17.8h, v1.8h

        urshl           v4.4s, v4.4s, v29.4s
        urshl           v5.4s, v5.4s, v29.4s
        urshl           v18.4s, v18.4s, v29.4s
        urshl           v19.4s, v19.4s, v29.4s
        uqxtn           v6.4h, v4.4s
        uqxtn2          v6.8h, v5.4s
        uqxtn           v7.4h, v18.4s
        uqxtn2          v7.8h, v19.4s
        stp             q6, q7, [x13], #32

        cbz             w10, 3f

        ldp             q4, q5, [x12], #32
        umull           v17.4s, v4.4h, v2.4h
        umull2          v18.4s, v4.8h, v2.8h
        umull           v19.4s, v5.4h, v2.4h
        umull2          v20.4s, v5.8h, v2.8h
        umlal           v17.4s, v6.4h, v3.4h
        umlal2          v18.4s, v6.8h, v3.8h
        umlal           v19.4s, v7.4h, v3.4h
        umlal2          v20.4s, v7.8h, v3.8h
        uqrshrn         v6.4h, v17.4s, #4
        uqrshrn2        v6.8h, v18.4s, #4
        uqrshrn         v7.4h, v19.4s, #4
        uqrshrn2        v7.8h, v20.4s, #4
        stp             q6, q7, [x14], #32
        b               3f
2:
        // width > 8
        ldur            q7, [src, #2]
        ldr             q6, [src], #16
        umull           v4.4s, v6.4h, v0.4h
        umull2          v5.4s, v6.8h, v0.8h
        umlal           v4.4s, v7.4h, v1.4h
        umlal2          v5.4s, v7.8h, v1.8h

        urshl           v4.4s, v4.4s, v29.4s
        urshl           v5.4s, v5.4s, v29.4s
        uqxtn           v6.4h, v4.4s
        uqxtn2          v6.8h, v5.4s
        str             q6, [x13], #16

        cbz             w10, 3f

        ldr             q16, [x12], #16
        umull           v17.4s, v16.4h, v2.4h
        umull2          v18.4s, v16.8h, v2.8h
        umlal           v17.4s, v6.4h, v3.4h
        umlal2          v18.4s, v6.8h, v3.8h
        urshr           v17.4s, v17.4s, #4
        urshr           v18.4s, v18.4s, #4
        uqxtn           v16.4h, v17.4s
        uqxtn2          v16.8h, v18.4s
        str             q16, [x14], #16
3:
        ldur            d7, [src, #2]
        ldr             d6, [src], #8
        umull           v4.4s, v7.4h, v1.4h
        umlal           v4.4s, v6.4h, v0.4h
        urshl           v4.4s, v4.4s, v29.4s
        uqxtn           v6.4h, v4.4s
        str             d6, [x13], #8

        cbz             w10, 4f

        ldr             d16, [x12], #8
        umull           v17.4s, v16.4h, v2.4h
        umlal           v17.4s, v6.4h, v3.4h
        urshr           v17.4s, v17.4s, #4
        uqxtn           v16.4h, v17.4s
        str             d16, [x14], #8
4:
        subs            height, height, #1
        mov             w10, #1
        add             src, src, src_stride
        add             dst, dst, #(VVC_MAX_PB_SIZE * 2)
        // swap tmp0 and tmp1 (three-XOR swap)
        eor             tmp0, tmp0, tmp1
        eor             tmp1, tmp0, tmp1
        eor             tmp0, tmp0, tmp1
        b.ne            1b

        add             sp, sp, #(VVC_MAX_PB_SIZE * 4)
        ret

.unreq dst
.unreq src
.unreq src_stride
.unreq height
.unreq mx
.unreq my
.unreq width
.unreq tmp0
.unreq tmp1
endfunc
function ff_vvc_prof_grad_filter_8x_neon, export=1
        gh              .req x0
        gv              .req x1
        gstride         .req x2
        src             .req x3
        src_stride      .req x4
        width           .req w5
        height          .req w6

        lsl             src_stride, src_stride, #1
        neg             x7, src_stride
1:
        mov             x10, src
        mov             w11, width
        mov             x12, gh
        mov             x13, gv
2:
        ldur            q0, [x10, #2]
        ldur            q1, [x10, #-2]
        subs            w11, w11, #8
        ldr             q2, [x10, src_stride]
        ldr             q3, [x10, x7]
        sshr            v0.8h, v0.8h, #6
        sshr            v1.8h, v1.8h, #6
        sshr            v2.8h, v2.8h, #6
        sshr            v3.8h, v3.8h, #6
        sub             v0.8h, v0.8h, v1.8h
        sub             v2.8h, v2.8h, v3.8h
        st1             {v0.8h}, [x12], #16
        st1             {v2.8h}, [x13], #16
        add             x10, x10, #16
        b.ne            2b

        subs            height, height, #1
        add             gh, gh, gstride, lsl #1
        add             gv, gv, gstride, lsl #1
        add             src, src, src_stride
        b.ne            1b
        ret

.unreq gh
.unreq gv
.unreq gstride
.unreq src
.unreq src_stride
.unreq width
.unreq height
endfunc
function vvc_bdof_grad_filter_8x_neon, export=0
        gh0             .req x0
        gh1             .req x1
        gv0             .req x2
        gv1             .req x3
        src0            .req x4
        src1            .req x5
        width           .req w6
        height          .req w7
        tbnz            w6, #4, 16f

8:
        ldur            q0, [src0, #2]
        ldur            q1, [src0, #-2]
        ldr             q2, [src0, #(VVC_MAX_PB_SIZE << 1)]
        ldr             q3, [src0, #-(VVC_MAX_PB_SIZE << 1)]
        sshr            v0.8h, v0.8h, #6
        sshr            v1.8h, v1.8h, #6
        ldur            q4, [src1, #2]
        ldur            q5, [src1, #-2]
        sshr            v2.8h, v2.8h, #6
        sshr            v3.8h, v3.8h, #6
        ldr             q6, [src1, #(VVC_MAX_PB_SIZE << 1)]
        ldr             q7, [src1, #-(VVC_MAX_PB_SIZE << 1)]
        // results of gradient_h0
        sub             v0.8h, v0.8h, v1.8h
        // results of gradient_v0
        sub             v2.8h, v2.8h, v3.8h

        sshr            v4.8h, v4.8h, #6
        sshr            v5.8h, v5.8h, #6
        sshr            v6.8h, v6.8h, #6
        sshr            v7.8h, v7.8h, #6
        // results of gradient_h1
        sub             v4.8h, v4.8h, v5.8h
        // results of gradient_v1
        sub             v6.8h, v6.8h, v7.8h

        // (gradient_h0 + gradient_h1) >> 1
        shadd           v1.8h, v0.8h, v4.8h
        // gradient_h0 - gradient_h1
        sub             v5.8h, v0.8h, v4.8h

        // (gradient_v0 + gradient_v1) >> 1
        shadd           v3.8h, v2.8h, v6.8h
        // gradient_v0 - gradient_v1
        sub             v7.8h, v2.8h, v6.8h

        st1             {v1.8h}, [gh0]
        st1             {v5.8h}, [gh1]
        st1             {v3.8h}, [gv0]
        st1             {v7.8h}, [gv1]

        subs            height, height, #1
        add             gh0, gh0, #(BDOF_BLOCK_SIZE << 1)
        add             gv0, gv0, #(BDOF_BLOCK_SIZE << 1)
        add             src0, src0, #(VVC_MAX_PB_SIZE << 1)
        add             gh1, gh1, #(BDOF_BLOCK_SIZE << 1)
        add             gv1, gv1, #(BDOF_BLOCK_SIZE << 1)
        add             src1, src1, #(VVC_MAX_PB_SIZE << 1)
        b.ne            8b
        ret

16:
        ldur            q0, [src0, #2]
        ldur            q1, [src0, #18]
        ldur            q16, [src0, #-2]
        sshr            v0.8h, v0.8h, #6
        ldur            q17, [src0, #14]
        sshr            v1.8h, v1.8h, #6
        ldp             q18, q19, [src0, #-(VVC_MAX_PB_SIZE << 1)]
        sshr            v16.8h, v16.8h, #6
        ldp             q2, q3, [src0, #(VVC_MAX_PB_SIZE << 1)]!
        ldur            q20, [src1, #2]
        sshr            v17.8h, v17.8h, #6
        ldur            q21, [src1, #18]
        sshr            v2.8h, v2.8h, #6
        ldur            q22, [src1, #-2]
        sshr            v3.8h, v3.8h, #6
        ldur            q23, [src1, #14]
        sshr            v18.8h, v18.8h, #6
        ldp             q26, q27, [src1, #-(VVC_MAX_PB_SIZE << 1)]
        sshr            v19.8h, v19.8h, #6
        ldp             q24, q25, [src1, #(VVC_MAX_PB_SIZE << 1)]!

        // results of gradient_h0
        sub             v0.8h, v0.8h, v16.8h
        sub             v1.8h, v1.8h, v17.8h

        // results of gradient_v0
        sub             v2.8h, v2.8h, v18.8h
        sub             v3.8h, v3.8h, v19.8h

        sshr            v20.8h, v20.8h, #6
        sshr            v21.8h, v21.8h, #6
        sshr            v22.8h, v22.8h, #6
        sshr            v23.8h, v23.8h, #6

        // results of gradient_h1
        sub             v20.8h, v20.8h, v22.8h
        sub             v21.8h, v21.8h, v23.8h

        sshr            v24.8h, v24.8h, #6
        sshr            v25.8h, v25.8h, #6

        // gradient_h0 - gradient_h1
        sub             v22.8h, v0.8h, v20.8h
        sub             v23.8h, v1.8h, v21.8h

        // (gradient_h0 + gradient_h1) >> 1
        shadd           v16.8h, v0.8h, v20.8h
        shadd           v17.8h, v1.8h, v21.8h

        st1             {v22.8h, v23.8h}, [gh1], #32

        sshr            v26.8h, v26.8h, #6
        sshr            v27.8h, v27.8h, #6

        st1             {v16.8h, v17.8h}, [gh0], #32

        // results of gradient_v1
        sub             v24.8h, v24.8h, v26.8h
        sub             v25.8h, v25.8h, v27.8h

        // (gradient_v0 + gradient_v1) >> 1
        shadd           v18.8h, v2.8h, v24.8h
        shadd           v19.8h, v3.8h, v25.8h

        // gradient_v0 - gradient_v1
        sub             v26.8h, v2.8h, v24.8h
        sub             v27.8h, v3.8h, v25.8h

        st1             {v18.8h, v19.8h}, [gv0], #32

        subs            height, height, #1
        st1             {v26.8h, v27.8h}, [gv1], #32

        b.ne            16b
        ret

.unreq gh0
.unreq gh1
.unreq gv0
.unreq gv1
.unreq src0
.unreq src1
.unreq width
.unreq height
endfunc
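/*
 * Rough C model of the gradients computed above, per sample (illustrative
 * sketch; src0/src1 are 16-bit intermediates with stride VVC_MAX_PB_SIZE):
 *
 *     int gh0 = (src0[x + 1] >> 6) - (src0[x - 1] >> 6);
 *     int gv0 = (src0[x + stride] >> 6) - (src0[x - stride] >> 6);
 *     // gh1/gv1 likewise from src1, then:
 *     gh_sum[x]  = (gh0 + gh1) >> 1;   // shadd, written to the gh0 buffer
 *     gh_diff[x] = gh0 - gh1;          // written to the gh1 buffer
 *     // gv_sum / gv_diff analogously
 */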
.macro vvc_apply_bdof_block_8x bit_depth
        dst             .req x0
        dst_stride      .req x1
        src0            .req x2
        src1            .req x3
        gh              .req x4
        gv              .req x5
        vx              .req x6
        vy              .req x7

        ldr             w8, [sp]
        mov             x12, #(BDOF_BLOCK_SIZE * 2)
        mov             x14, #(VVC_MAX_PB_SIZE * 2)
.if \bit_depth >= 10
        // clip pixel
        mov             w15, #((1 << \bit_depth) - 1)
        dup             v19.8h, w15
.endif

0:
        ldr             s0, [vx], #(2 * BDOF_MIN_BLOCK_SIZE)
        ldr             s1, [vy], #(2 * BDOF_MIN_BLOCK_SIZE)
        mov             w13, #(BDOF_MIN_BLOCK_SIZE)
1:
        ld1             {v5.8h}, [src0], x14
        ld1             {v6.8h}, [src1], x14

        saddl           v17.4s, v5.4h, v6.4h
        ld1             {v4.8h}, [gv], x12
        saddl2          v16.4s, v5.8h, v6.8h
        ld1             {v2.8h}, [gh], x12
        smlal           v17.4s, v4.4h, v1.h[0]
        smlal2          v16.4s, v4.8h, v1.h[1]
        smlal           v17.4s, v2.4h, v0.h[0]
        smlal2          v16.4s, v2.8h, v0.h[1]

        sqrshrun        v5.4h, v17.4s, #(15 - \bit_depth)
        sqrshrun2       v5.8h, v16.4s, #(15 - \bit_depth)
        subs            w13, w13, #1
.if \bit_depth == 8
        sqxtun          v5.8b, v5.8h
        st1             {v5.8b}, [dst], dst_stride
.else
        smin            v5.8h, v5.8h, v19.8h
        st1             {v5.8h}, [dst], dst_stride
.endif
        b.ne            1b

        subs            w8, w8, #(BDOF_MIN_BLOCK_SIZE)
        b.ne            0b
        ret

.unreq dst
.unreq dst_stride
.unreq src0
.unreq src1
.unreq gh
.unreq gv
.unreq vx
.unreq vy
.endm

function vvc_apply_bdof_block_8x_8_neon, export=0
        vvc_apply_bdof_block_8x 8
endfunc

function vvc_apply_bdof_block_8x_10_neon, export=0
        vvc_apply_bdof_block_8x 10
endfunc

function vvc_apply_bdof_block_8x_12_neon, export=0
        vvc_apply_bdof_block_8x 12
endfunc
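/*
 * Per pixel, the 8x macros above and the 16x macros below both evaluate
 * (hedged sketch following the smlal/sqrshrun sequence; vx/vy are the
 * per-4x4 refinements and gh/gv the gradient difference planes):
 *
 *     int b   = vx * gh[x] + vy * gv[x];
 *     int val = (src0[x] + src1[x] + b + (1 << (14 - bit_depth)))
 *                   >> (15 - bit_depth);
 *     dst[x]  = av_clip(val, 0, (1 << bit_depth) - 1);
 *
 * The rounding offset is folded into sqrshrun in the 8x variant and added
 * explicitly (v7) before sqshrn in the 16x variant.
 */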
.macro vvc_apply_bdof_block_16x bit_depth
        dst             .req x0
        dst_stride      .req x1
        src0            .req x2
        src1            .req x3
        gh              .req x4
        gv              .req x5
        vx              .req x6
        vy              .req x7

        ldr             w8, [sp]
        movi            v7.4s, #(1 << (14 - \bit_depth))
.if \bit_depth >= 10
        // clip pixel
        mov             w15, #((1 << \bit_depth) - 1)
        movi            v18.8h, #0
        dup             v19.8h, w15
.endif

0:
        ld1r            {v0.8h}, [vx], #2
        ld1r            {v1.8h}, [vy], #2
        ld1r            {v2.8h}, [vx], #2
        ld1r            {v3.8h}, [vy], #2

        mov             w13, #(BDOF_MIN_BLOCK_SIZE)

        ld1r            {v20.8h}, [vx], #2
        ld1r            {v21.8h}, [vy], #2
        ld1r            {v22.8h}, [vx], #2
        ld1r            {v23.8h}, [vy], #2

        ins             v0.d[1], v2.d[1]
        ins             v1.d[1], v3.d[1]
        ins             v20.d[1], v22.d[1]
        ins             v21.d[1], v23.d[1]
1:
        ldp             q2, q22, [gh], #(BDOF_BLOCK_SIZE * 2)
        ldp             q4, q24, [gv], #(BDOF_BLOCK_SIZE * 2)
        smull           v3.4s, v0.4h, v2.4h
        smull2          v16.4s, v0.8h, v2.8h
        smlal           v3.4s, v1.4h, v4.4h
        smlal2          v16.4s, v1.8h, v4.8h

        ldp             q5, q25, [src0], #(VVC_MAX_PB_SIZE * 2)
        ldp             q6, q26, [src1], #(VVC_MAX_PB_SIZE * 2)

        smull           v23.4s, v20.4h, v22.4h
        smull2          v27.4s, v20.8h, v22.8h
        smlal           v23.4s, v21.4h, v24.4h
        smlal2          v27.4s, v21.8h, v24.8h

        saddl           v2.4s, v5.4h, v6.4h
        add             v2.4s, v2.4s, v7.4s
        add             v2.4s, v2.4s, v3.4s
        saddl2          v4.4s, v5.8h, v6.8h
        add             v4.4s, v4.4s, v7.4s
        add             v4.4s, v4.4s, v16.4s

        saddl           v22.4s, v25.4h, v26.4h
        add             v22.4s, v22.4s, v7.4s
        add             v22.4s, v22.4s, v23.4s
        saddl2          v24.4s, v25.8h, v26.8h
        add             v24.4s, v24.4s, v7.4s
        add             v24.4s, v24.4s, v27.4s

        sqshrn          v5.4h, v2.4s, #(15 - \bit_depth)
        sqshrn2         v5.8h, v4.4s, #(15 - \bit_depth)
        sqshrn          v25.4h, v22.4s, #(15 - \bit_depth)
        sqshrn2         v25.8h, v24.4s, #(15 - \bit_depth)

        subs            w13, w13, #1
.if \bit_depth == 8
        sqxtun          v5.8b, v5.8h
        sqxtun2         v5.16b, v25.8h
        str             q5, [dst]
.else
        smin            v5.8h, v5.8h, v19.8h
        smax            v5.8h, v5.8h, v18.8h
        smin            v25.8h, v25.8h, v19.8h
        smax            v25.8h, v25.8h, v18.8h
        stp             q5, q25, [dst]
.endif
        add             dst, dst, dst_stride
        b.ne            1b

        subs            w8, w8, #(BDOF_MIN_BLOCK_SIZE)
        b.ne            0b
        ret

.unreq dst
.unreq dst_stride
.unreq src0
.unreq src1
.unreq gh
.unreq gv
.unreq vx
.unreq vy
.endm

function vvc_apply_bdof_block_16x_8_neon, export=0
        vvc_apply_bdof_block_16x 8
endfunc

function vvc_apply_bdof_block_16x_10_neon, export=0
        vvc_apply_bdof_block_16x 10
endfunc

function vvc_apply_bdof_block_16x_12_neon, export=0
        vvc_apply_bdof_block_16x 12
endfunc
const bdof_vx_vy_8x_tbl
        .byte 0, 1, 16, 16, 16, 16, 8, 9
        .byte 6, 7, 16, 16, 16, 16, 14, 15
endconst

const bdof_vx_vy_16x_tbl
        .byte 0, 1, 64, 64, 64, 64, 8, 9
        .byte 6, 7, 64, 64, 64, 64, 16, 17
        .byte 14, 15, 64, 64, 64, 64, 24, 25
        .byte 22, 23, 64, 64, 64, 64, 30, 31
endconst
// line(-1), line0, line1, line2, line3, line4
// line3 and line4 become line(-1) and line0 in the next block.
.macro bdof_vx_vy_8x_save_line tmp0, tmp1, tmp2, tmp3, tmp4
        mov             \tmp0\().16b, v28.16b
        mov             \tmp1\().16b, v29.16b
        mov             \tmp2\().16b, v30.16b
        mov             \tmp3\().16b, v31.16b
        mov             \tmp4\().16b, v8.16b
.endm

.macro bdof_vx_vy_8x_add_line tmp0, tmp1, tmp2, tmp3, tmp4
        add             v25.4s, v25.4s, \tmp0\().4s
        add             v27.4s, v27.4s, \tmp1\().4s
        add             v23.4s, v23.4s, \tmp2\().4s
        sub             v26.4s, v26.4s, \tmp3\().4s
        sub             v24.4s, v24.4s, \tmp4\().4s
.endm

.macro bdof_vx_vy_8x_padding_left_right src, tmp0, tmp1, dst
        tbl             \tmp0\().16b, { \src\().16b }, v0.16b
        saddl           \tmp1\().4s, \tmp0\().4h, \src\().4h
        saddl2          \dst\().4s, \tmp0\().8h, \src\().8h
        addp            \dst\().4s, \tmp1\().4s, \dst\().4s
.endm
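/*
 * The tbl-based padding above implements the C-1 == C0 / C16 == C15
 * mirroring described in the "Pixel tricks" comment below: v0 (and v1 for
 * the 16x variant) holds byte indices from bdof_vx_vy_*_tbl, out-of-range
 * indices return zero lanes, and the addp steps then reduce each 4-pixel
 * group to one padded 6-tap sum. For the first group (sketch):
 *
 *     sum0 = (c0 + c0) + c1 + c2 + (c3 + c4);   // c[-1] duplicated as c0
 */
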
// dst = sign(src) per lane: -1, 0 or 1
.macro bdof_vx_vy_sign src, tmp0, tmp1, dst
        cmlt            \tmp0\().8h, \src\().8h, #0
        cmgt            \tmp1\().8h, \src\().8h, #0
        sub             \dst\().8h, \tmp0\().8h, \tmp1\().8h
.endm

// dst = (mask > 0) ? clip(src, min, max) : 0
.macro bdof_vx_vy_clip_mask src, max, min, mask, dst
        smin            \src\().4s, \src\().4s, \max\().4s
        smax            \src\().4s, \src\().4s, \min\().4s
        cmgt            \mask\().4s, \mask\().4s, #0
        and             \dst\().16b, \src\().16b, \mask\().16b
.endm

.macro bdof_vx_vy_16x_save_line tmp0, tmp1, tmp2, tmp3, tmp4
        mov             \tmp0\().16b, v29.16b
        mov             \tmp1\().16b, v30.16b
        mov             \tmp2\().16b, v31.16b
        mov             \tmp3\().16b, v8.16b
        mov             \tmp4\().16b, v9.16b
.endm

.macro bdof_vx_vy_16x_add_line tmp0, tmp1, tmp2, tmp3, tmp4
        add             v25.4s, v25.4s, \tmp0\().4s
        add             v24.4s, v24.4s, \tmp1\().4s
        add             v26.4s, v26.4s, \tmp2\().4s
        sub             v28.4s, v28.4s, \tmp3\().4s
        sub             v27.4s, v27.4s, \tmp4\().4s
.endm

.macro bdof_vx_vy_16x_padding_left_right src0, src1, tmp0, tmp1, tmp2, dst
        tbl             \tmp0\().16b, {\src0\().16b, \src1\().16b}, v0.16b
        tbl             v2.16b, {\src0\().16b, \src1\().16b}, v1.16b
        saddl           \tmp1\().4s, \tmp0\().4h, \src0\().4h
        saddl           \tmp2\().4s, v2.4h, \src1\().4h
        saddl2          \tmp0\().4s, \tmp0\().8h, \src0\().8h
        saddl2          \dst\().4s, v2.8h, \src1\().8h
        addp            \tmp0\().4s, \tmp1\().4s, \tmp0\().4s
        addp            \dst\().4s, \tmp2\().4s, \dst\().4s
        addp            \dst\().4s, \tmp0\().4s, \dst\().4s
.endm
/*
 * Line tricks:
 * We need 6 lines of information, from line 4N-1 through line 4N+4.
 * Lines 4N-1 and 4N+0 were processed in the previous group, so they
 * can be reused.
 *
 * (4N-1) [xxxxxxxxxxxxx] <--- reuse
 * (4N)   [xxxxxxxxxxxxx] <--- reuse
 * (4N+1) [xxxxxxxxxxxxx]
 * (4N+2) [xxxxxxxxxxxxx]
 * (4N+3) [xxxxxxxxxxxxx] ---> save for reuse
 * (4N+4) [xxxxxxxxxxxxx] ---> save for reuse
 *
 * Special cases:
 * 1. Line -1 needs to duplicate line 0.
 * 2. The line after the last one needs to duplicate the last line.
 *
 * ---------------------------------------------------------------------
 * Pixel tricks:
 *
 * [C-1, C0, C1, C2, ... C16]
 *
 * For each line, we need to sum parameters for 4 * 6 pixels:
 * - C-1 + C0 + C1 + C2 + C3 + C4
 * - C3 + C4 + C5 + C6 + C7 + C8
 * - C7 + C8 + C9 + C10 + C11 + C12
 * - C11 + C12 + C13 + C14 + C15 + C16
 *
 * C-1 is C0 and C16 is C15, so we can do:
 *
 * [C0, C1, C2, C3, | C4, C5, C6, C7, | C8, ... C15]
 *        +         |        +        |
 * [C0,  0,  0, C4, | C3,  0,  0, C8, | C7, ... C15]
 *
 * The 8x case is similar.
 * ----------------------------------------------------------------------
 * x0: const int16_t *_src0,
 * x1: const int16_t *_src1,
 * x2: const int16_t *gradient_h,
 * x3: const int16_t *gradient_v,
 * x4: int16_t vx[16],
 * x5: int16_t vy[16],
 * w6: int block_h
 */
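/*
 * For reference, the per-4x4 derivation below roughly follows the BDOF
 * equations of the VVC spec (hedged sketch; the names come from the spec
 * text, not necessarily FFmpeg's C code):
 *
 *     sGx2  = sum(|gh|);               sGy2  = sum(|gv|);
 *     sGxGy = sum(sign(gv) * gh);
 *     sGxdI = sum(-sign(gh) * diff);   sGydI = sum(-sign(gv) * diff);
 *     vx = sGx2 ? clip(-15, 15, (sGxdI << 2) >> floor(log2(sGx2))) : 0;
 *     vy = sGy2 ? clip(-15, 15, ((sGydI << 2) - ((vx * sGxGy) >> 1))
 *                                   >> floor(log2(sGy2))) : 0;
 */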
function vvc_derive_bdof_vx_vy_8x_neon, export=0
        stp             d11, d10, [sp, #-0x20]!
        stp             d9, d8, [sp, #0x10]

        movrel          x11, bdof_vx_vy_8x_tbl
        ldr             q0, [x11]               // table
        mvni            v2.4s, #30              // -31, for log2
        movi            v3.4s, #15              // clip to 15
        mvni            v4.4s, #14              // clip to -15

        mov             w11, #0x8
        mov             w12, w6                 // y = block_h
        b               4f

1:
        // save line4 results
        bdof_vx_vy_8x_save_line v5, v6, v7, v16, v17
2:
        addp            v25.4s, v25.4s, v25.4s
        addp            v27.4s, v27.4s, v27.4s
        addp            v26.4s, v26.4s, v26.4s
        addp            v23.4s, v23.4s, v23.4s
        addp            v24.4s, v24.4s, v24.4s

        clz             v28.4s, v25.4s
        add             v28.4s, v28.4s, v2.4s   // log2
        shl             v26.4s, v26.4s, #0x2
        sshl            v26.4s, v26.4s, v28.4s

        bdof_vx_vy_clip_mask v26, v3, v4, v25, v25
        sqxtn           v26.4h, v25.4s
        st1             {v26.s}[0], [x4], x11

        subs            x12, x12, #(BDOF_MIN_BLOCK_SIZE)

        clz             v26.4s, v27.4s
        add             v26.4s, v26.4s, v2.4s
        shl             v24.4s, v24.4s, #0x2
        mul             v23.4s, v25.4s, v23.4s
        sshr            v23.4s, v23.4s, #0x1
        sub             v23.4s, v24.4s, v23.4s
        sshl            v23.4s, v23.4s, v26.4s

        bdof_vx_vy_clip_mask v23, v3, v4, v27, v23
        sqxtn           v23.4h, v23.4s
        st1             {v23.s}[0], [x5], x11

        b.eq            16f
4:
        mov             x15, #0x0               // dy, inner loop

        movi            v25.2d, #0
        movi            v27.2d, #0
        movi            v23.2d, #0
        movi            v26.2d, #0
        movi            v24.2d, #0
        b               8f

5:
        // add line(-1) and line0 from previous results
        bdof_vx_vy_8x_add_line v18, v19, v20, v21, v22
        bdof_vx_vy_8x_add_line v5, v6, v7, v16, v17
        add             x15, x15, #1
8:
        cmp             w12, w6
        b.hs            9f
        // y < block_h && dy == 0, reuse previous results
        cbz             x15, 5b
9:
        ldr             q28, [x0]               // src0
        ldr             q29, [x1]               // src1
        ldr             q30, [x2], #(BDOF_BLOCK_SIZE * 2) // (gh0 + gh1) >> 1
        ldr             q31, [x3], #(BDOF_BLOCK_SIZE * 2) // (gv0 + gv1) >> 1
        add             x0, x0, #(VVC_MAX_PB_SIZE * 2)
        add             x1, x1, #(VVC_MAX_PB_SIZE * 2)

        sshr            v28.8h, v28.8h, #0x4
        sshr            v29.8h, v29.8h, #0x4
        sub             v8.8h, v28.8h, v29.8h   // diff

        abs             v28.8h, v30.8h
        abs             v29.8h, v31.8h

        bdof_vx_vy_8x_padding_left_right v28, v9, v10, v28
        bdof_vx_vy_8x_padding_left_right v29, v9, v10, v29

        bdof_vx_vy_sign v30, v9, v10, v9
        bdof_vx_vy_sign v31, v10, v31, v31

        mul             v30.8h, v31.8h, v30.8h
        mul             v9.8h, v9.8h, v8.8h
        mul             v8.8h, v31.8h, v8.8h

        bdof_vx_vy_8x_padding_left_right v30, v31, v10, v30
        bdof_vx_vy_8x_padding_left_right v9, v31, v10, v31
        bdof_vx_vy_8x_padding_left_right v8, v9, v10, v8

        bdof_vx_vy_8x_add_line v28, v29, v30, v31, v8

        cmp             w12, w6
        b.ne            10f
        cbnz            x15, 10f

        // y == block_h && dy == 0, duplicate first line results
        bdof_vx_vy_8x_add_line v28, v29, v30, v31, v8
        add             x15, x15, #0x1
        b               9b
10:
        cmp             x15, #(BDOF_MIN_BLOCK_SIZE - 1)
        b.eq            11f
        cmp             x15, #(BDOF_MIN_BLOCK_SIZE)
        b.ne            12f
        b               1b
11:
        // y == BDOF_MIN_BLOCK_SIZE && dy == BDOF_MIN_BLOCK_SIZE - 1
        // duplicate the results and break
        cmp             x12, #(BDOF_MIN_BLOCK_SIZE)
        b.eq            13f
        bdof_vx_vy_8x_save_line v18, v19, v20, v21, v22
12:
        add             x15, x15, #1
        b               8b
13:
        // y == BDOF_MIN_BLOCK_SIZE && dy == BDOF_MIN_BLOCK_SIZE - 1
        // padding bottom then break
        bdof_vx_vy_8x_add_line v28, v29, v30, v31, v8
        b               2b
16:
        ldp             d9, d8, [sp, #0x10]
        ldp             d11, d10, [sp], #0x20
        ret
endfunc
/*
 * x0: const int16_t *_src0,
 * x1: const int16_t *_src1,
 * x2: const int16_t *gradient_h,
 * x3: const int16_t *gradient_v,
 * x4: int16_t vx[16],
 * x5: int16_t vy[16],
 * w6: int block_h
 */
function vvc_derive_bdof_vx_vy_16x_neon, export=0
        stp             d15, d14, [sp, #-0x40]!
        stp             d13, d12, [sp, #0x10]
        stp             d11, d10, [sp, #0x20]
        stp             d9, d8, [sp, #0x30]

        movrel          x12, bdof_vx_vy_16x_tbl
        ldp             q0, q1, [x12]           // table
        mov             w13, w6                 // y = block_h
        b               4f

1:
        // save line4
        bdof_vx_vy_16x_save_line v6, v7, v16, v17, v18
2:
        clz             v3.4s, v25.4s
        mvni            v5.4s, #0x1e
        add             v3.4s, v3.4s, v5.4s     // -log2()
        shl             v4.4s, v28.4s, #0x2
        sshl            v3.4s, v4.4s, v3.4s

        movi            v28.4s, #0xf            // clip to 15
        mvni            v29.4s, #0xe            // clip to -15
        bdof_vx_vy_clip_mask v3, v28, v29, v25, v3
        sqxtn           v4.4h, v3.4s
        st1             {v4.d}[0], [x4], #(BDOF_MIN_BLOCK_SIZE * 2)

        subs            x13, x13, #(BDOF_MIN_BLOCK_SIZE) // y -= BDOF_MIN_BLOCK_SIZE

        clz             v4.4s, v24.4s
        add             v4.4s, v4.4s, v5.4s     // -log2()
        shl             v5.4s, v27.4s, #0x2
        mul             v3.4s, v3.4s, v26.4s
        sshr            v3.4s, v3.4s, #0x1
        sub             v3.4s, v5.4s, v3.4s
        sshl            v3.4s, v3.4s, v4.4s

        bdof_vx_vy_clip_mask v3, v28, v29, v24, v3
        sqxtn           v3.4h, v3.4s
        st1             {v3.d}[0], [x5], #(BDOF_MIN_BLOCK_SIZE * 2)
        b.eq            16f
4:
        mov             w14, #0x0               // dy, inner loop

        movi            v25.2d, #0
        movi            v24.2d, #0
        movi            v26.2d, #0
        movi            v28.2d, #0
        movi            v27.2d, #0
        b               8f

5:
        // add line(-1) and line0 from previous results
        bdof_vx_vy_16x_add_line v19, v20, v21, v22, v23
        bdof_vx_vy_16x_add_line v6, v7, v16, v17, v18
        add             w14, w14, #0x1

8:
        cmp             w13, w6
        b.hs            9f
        // y < block_h && dy == 0, reuse previous results
        cbz             w14, 5b
9:
        ld1             {v29.8h, v30.8h}, [x0]  // src0
        sshr            v31.8h, v29.8h, #0x4
        ld1             {v8.8h, v9.8h}, [x1]    // src1
        sshr            v10.8h, v8.8h, #0x4
        ldp             q13, q8, [x2], #32      // (gh0 + gh1) >> 1
        sshr            v29.8h, v30.8h, #0x4
        sshr            v30.8h, v9.8h, #0x4
        ldp             q5, q3, [x3], #32       // (gv0 + gv1) >> 1
        sub             v31.8h, v31.8h, v10.8h  // diff, left half
        sub             v4.8h, v29.8h, v30.8h   // diff, right half

        abs             v29.8h, v13.8h
        abs             v30.8h, v8.8h
        abs             v9.8h, v5.8h
        abs             v10.8h, v3.8h

        add             x0, x0, #(VVC_MAX_PB_SIZE * 2)
        add             x1, x1, #(VVC_MAX_PB_SIZE * 2)

        bdof_vx_vy_16x_padding_left_right v29, v30, v11, v12, v14, v29
        bdof_vx_vy_16x_padding_left_right v9, v10, v11, v12, v14, v30

        bdof_vx_vy_sign v13, v9, v10, v9
        bdof_vx_vy_sign v8, v10, v11, v10
        bdof_vx_vy_sign v5, v11, v5, v5
        bdof_vx_vy_sign v3, v11, v3, v3

        mul             v11.8h, v5.8h, v13.8h
        mul             v12.8h, v3.8h, v8.8h
        mul             v8.8h, v9.8h, v31.8h
        mul             v9.8h, v10.8h, v4.8h
        mul             v13.8h, v5.8h, v31.8h
        mul             v14.8h, v3.8h, v4.8h

        bdof_vx_vy_16x_padding_left_right v11, v12, v3, v4, v5, v31
        bdof_vx_vy_16x_padding_left_right v8, v9, v3, v4, v5, v8
        bdof_vx_vy_16x_padding_left_right v13, v14, v3, v4, v5, v9

        bdof_vx_vy_16x_add_line v29, v30, v31, v8, v9
        // check whether padding top
        cmp             w13, w6
        b.ne            10f
        cbnz            w14, 10f
        // y == block_h && dy == 0, padding top
        bdof_vx_vy_16x_add_line v29, v30, v31, v8, v9
        add             w14, w14, #0x1
        b               9b
10:
        cmp             w14, #(BDOF_MIN_BLOCK_SIZE - 1)
        b.eq            11f
        cmp             w14, #(BDOF_MIN_BLOCK_SIZE)
        b.ne            12f
        // save line4
        b               1b
11:
        // y == BDOF_MIN_BLOCK_SIZE && dy == BDOF_MIN_BLOCK_SIZE - 1, padding bottom
        cmp             x13, #(BDOF_MIN_BLOCK_SIZE)
        b.eq            13f
        // save line3
        bdof_vx_vy_16x_save_line v19, v20, v21, v22, v23
12:
        add             w14, w14, #0x1          // dy++
        b               8b
13:
        // padding bottom
        bdof_vx_vy_16x_add_line v29, v30, v31, v8, v9
        b               2b
16:
        // restore
        ldp             d9, d8, [sp, #0x30]
        ldp             d11, d10, [sp, #0x20]
        ldp             d13, d12, [sp, #0x10]
        ldp             d15, d14, [sp], #0x40
        ret
endfunc
function ff_vvc_apply_bdof_10_neon, export=1
        mov             w6, #10
        b               0f
endfunc

function ff_vvc_apply_bdof_12_neon, export=1
        mov             w6, #12
        b               0f
endfunc

// int16_t gradient_buf_h[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2]
// int16_t gradient_buf_v[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2]
// int16_t vx[BDOF_BLOCK_SIZE], vy[BDOF_BLOCK_SIZE];
#define APPLY_BDOF_STACK_SIZE ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 8 + BDOF_BLOCK_SIZE * 4)
#define GRADIENT_H0_OFFSET 2
#define GRADIENT_H1_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 2 + 2)
#define GRADIENT_V0_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 4 + 2)
#define GRADIENT_V1_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 6 + 2)
#define VX_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 8)
#define VY_OFFSET ((BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE + 2) * 8 + BDOF_BLOCK_SIZE * 2)
function ff_vvc_apply_bdof_8_neon, export=1
        mov             w6, #8
0:
        stp             x19, x20, [sp, #-0x40]!
        stp             x21, x22, [sp, #0x10]
        stp             x23, x24, [sp, #0x20]
        stp             x25, x30, [sp, #0x30]

        sub             sp, sp, #APPLY_BDOF_STACK_SIZE
        mov             w19, w6                 // bit_depth
        mov             x20, x0                 // dst
        mov             x21, x1                 // dst_stride
        mov             x22, x2                 // src0
        mov             x23, x3                 // src1
        mov             w24, w4                 // block_w
        mov             w25, w5                 // block_h

        // int16_t *gradient_h[2] = {&gradient_buf_h[0][1], &gradient_buf_h[1][1]};
        add             x0, sp, #GRADIENT_H0_OFFSET
        add             x1, sp, #GRADIENT_H1_OFFSET
        add             x2, sp, #GRADIENT_V0_OFFSET
        add             x3, sp, #GRADIENT_V1_OFFSET
        mov             x4, x22
        mov             x5, x23
        mov             w6, w24
        mov             w7, w25
        bl              vvc_bdof_grad_filter_8x_neon

        cmp             w24, #8
        mov             x0, x22                 // src0
        mov             x1, x23                 // src1
        add             x2, sp, #GRADIENT_H0_OFFSET // gh0
        add             x3, sp, #GRADIENT_V0_OFFSET // gv0
        add             x4, sp, #VX_OFFSET      // vx
        add             x5, sp, #VY_OFFSET      // vy
        mov             w6, w25                 // block_h

        b.gt            16f

        bl              vvc_derive_bdof_vx_vy_8x_neon
        cmp             w19, #10                // check bitdepth
        mov             x0, x20                 // dst
        mov             x1, x21                 // dst_stride
        mov             x2, x22                 // src0
        mov             x3, x23                 // src1
        add             x4, sp, #GRADIENT_H1_OFFSET // gh1
        add             x5, sp, #GRADIENT_V1_OFFSET // gv1
        add             x6, sp, #VX_OFFSET
        add             x7, sp, #VY_OFFSET
        str             w25, [sp]
        b.eq            1f
        b.gt            2f
        // 8bit
0:
        bl              vvc_apply_bdof_block_8x_8_neon
        b               32f
1:
        // 10bit
        bl              vvc_apply_bdof_block_8x_10_neon
        b               32f
2:
        // 12bit
        bl              vvc_apply_bdof_block_8x_12_neon
        b               32f
16:
        bl              vvc_derive_bdof_vx_vy_16x_neon

        cmp             w19, #10                // check bitdepth
        mov             x0, x20                 // dst
        mov             x1, x21                 // dst_stride
        mov             x2, x22                 // src0
        mov             x3, x23                 // src1
        add             x4, sp, #GRADIENT_H1_OFFSET // gh1
        add             x5, sp, #GRADIENT_V1_OFFSET // gv1
        add             x6, sp, #VX_OFFSET
        add             x7, sp, #VY_OFFSET
        str             w25, [sp]
        b.eq            17f
        b.gt            18f
        // 8bit
        bl              vvc_apply_bdof_block_16x_8_neon
        b               32f
17:
        // 10bit
        bl              vvc_apply_bdof_block_16x_10_neon
        b               32f
18:
        // 12bit
        bl              vvc_apply_bdof_block_16x_12_neon
32:
        add             sp, sp, #APPLY_BDOF_STACK_SIZE
        ldp             x25, x30, [sp, #0x30]
        ldp             x23, x24, [sp, #0x20]
        ldp             x21, x22, [sp, #0x10]
        ldp             x19, x20, [sp], #0x40
        ret
endfunc

#undef APPLY_BDOF_STACK_SIZE
#undef GRADIENT_H0_OFFSET
#undef GRADIENT_H1_OFFSET
#undef GRADIENT_V0_OFFSET
#undef GRADIENT_V1_OFFSET
#undef VX_OFFSET
#undef VY_OFFSET

#define VVC_MAX_PB_SIZE 128

.macro put_luma_h_x8_vector_filter shift
        // The 8 filter bytes from hf have been sign-extended into v0.8h.
        // 32 bytes of _src have been loaded into v20.8h and v21.8h; v21.8h
        // supplies the shifted windows v1.8h, ..., v6.8h and v17.8h.
        // v24.4h and v25.4h are the output vectors to store.
        ext             v1.16b, v20.16b, v21.16b, #2
        ext             v2.16b, v20.16b, v21.16b, #4
        ext             v3.16b, v20.16b, v21.16b, #6
        ext             v4.16b, v20.16b, v21.16b, #8
        ext             v5.16b, v20.16b, v21.16b, #10
        ext             v6.16b, v20.16b, v21.16b, #12
        ext             v17.16b, v20.16b, v21.16b, #14
        smull           v24.4s, v20.4h, v0.h[0]
        smull2          v25.4s, v20.8h, v0.h[0]
        smlal           v24.4s, v1.4h, v0.h[1]
        smlal2          v25.4s, v1.8h, v0.h[1]
        smlal           v24.4s, v2.4h, v0.h[2]
        smlal2          v25.4s, v2.8h, v0.h[2]
        smlal           v24.4s, v3.4h, v0.h[3]
        smlal2          v25.4s, v3.8h, v0.h[3]
        smlal           v24.4s, v4.4h, v0.h[4]
        smlal2          v25.4s, v4.8h, v0.h[4]
        smlal           v24.4s, v5.4h, v0.h[5]
        smlal2          v25.4s, v5.8h, v0.h[5]
        smlal           v24.4s, v6.4h, v0.h[6]
        smlal2          v25.4s, v6.8h, v0.h[6]
        smlal           v24.4s, v17.4h, v0.h[7]
        smlal2          v25.4s, v17.8h, v0.h[7]
        sqshrn          v24.4h, v24.4s, #(\shift)
        sqshrn          v25.4h, v25.4s, #(\shift)
.endm
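/*
 * Rough C model of one row produced with the macro above (illustrative
 * sketch; hf holds the 8 filter taps and src already points 3 samples
 * left of the block, matching the "sub x1, x1, #6" below, in 16-bit units):
 *
 *     for (int x = 0; x < width; x++) {
 *         int sum = 0;
 *         for (int i = 0; i < 8; i++)
 *             sum += hf[i] * src[x + i];
 *         dst[x] = av_clip_int16(sum >> shift);   // sqshrn saturates
 *     }
 */
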
.macro put_luma_h8_xx_neon shift
        mov             x9, #(VVC_MAX_PB_SIZE * 2)
        ld1             {v0.8b}, [x4]
        sub             x1, x1, #6
        sxtl            v0.8h, v0.8b
1:
        ld1             {v20.8h, v21.8h}, [x1], x2
        put_luma_h_x8_vector_filter \shift
        subs            w3, w3, #1
        st1             {v24.4h, v25.4h}, [x0], x9
        b.gt            1b
        ret
.endm

.macro put_luma_h16_xx_neon shift
        mov             x9, #(VVC_MAX_PB_SIZE * 2)
        ld1             {v0.8b}, [x4]
        sub             x9, x9, #16
        sub             x1, x1, #6
        sxtl            v0.8h, v0.8b
1:
        ld1             {v20.8h, v21.8h, v22.8h}, [x1], x2
        put_luma_h_x8_vector_filter \shift
        mov             v20.16b, v21.16b
        mov             v21.16b, v22.16b
        st1             {v24.4h, v25.4h}, [x0], #16
        put_luma_h_x8_vector_filter \shift
        subs            w3, w3, #1
        st1             {v24.4h, v25.4h}, [x0], x9
        b.gt            1b
        ret
.endm

.macro put_luma_h_x16_xx_neon shift
        mov             x9, #(VVC_MAX_PB_SIZE * 2)
        ld1             {v0.8b}, [x4]
        sub             x9, x9, w6, uxtw #1
        sub             x2, x2, w6, uxtw #1
        sxtl            v0.8h, v0.8b
        sub             x1, x1, #6
        sub             x2, x2, #16
1:
        ld1             {v20.8h}, [x1], #16
        mov             w8, w6
2:
        ld1             {v21.8h, v22.8h}, [x1], #32
        put_luma_h_x8_vector_filter \shift
        mov             v20.16b, v21.16b
        mov             v21.16b, v22.16b
        st1             {v24.4h, v25.4h}, [x0], #16
        put_luma_h_x8_vector_filter \shift
        mov             v20.16b, v21.16b
        subs            w8, w8, #16
        st1             {v24.4h, v25.4h}, [x0], #16
        b.gt            2b
        subs            w3, w3, #1
        add             x0, x0, x9
        add             x1, x1, x2
        b.gt            1b
        ret
.endm

function ff_vvc_put_luma_h8_10_neon, export=1
        put_luma_h8_xx_neon 2
endfunc

function ff_vvc_put_luma_h8_12_neon, export=1
        put_luma_h8_xx_neon 4
endfunc

function ff_vvc_put_luma_h16_10_neon, export=1
        put_luma_h16_xx_neon 2
endfunc

function ff_vvc_put_luma_h16_12_neon, export=1
        put_luma_h16_xx_neon 4
endfunc

function ff_vvc_put_luma_h_x16_10_neon, export=1
        put_luma_h_x16_xx_neon 2
endfunc

function ff_vvc_put_luma_h_x16_12_neon, export=1
        put_luma_h_x16_xx_neon 4
endfunc
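
// Note: the shift passed to the instantiations above appears to be
// bit_depth - 8 (2 for 10-bit, 4 for 12-bit), reducing the 8-tap sum back
// to the 14-bit intermediate domain consumed by the avg/w_avg helpers
// earlier in this file.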