diff --git a/libavcodec/ffv1_vulkan.c b/libavcodec/ffv1_vulkan.c
index 2a2226016f..73c2b2a7ce 100644
--- a/libavcodec/ffv1_vulkan.c
+++ b/libavcodec/ffv1_vulkan.c
@@ -41,7 +41,7 @@ void ff_ffv1_vk_set_common_sl(AVCodecContext *avctx, FFV1Context *f,
     }
 
     int bits = desc->comp[0].depth;
-    SPEC_LIST_ADD(sl, 5, 32, 1 << bits);
+    SPEC_LIST_ADD(sl, 5, 32, (uint32_t)(1ULL << bits));
     SPEC_LIST_ADD(sl, 6, 32, f->colorspace);
     SPEC_LIST_ADD(sl, 7, 32, f->transparency);
     SPEC_LIST_ADD(sl, 8, 32, ff_vk_mt_is_np_rgb(sw_format) &&
diff --git a/libavcodec/vulkan/ffv1_common.glsl b/libavcodec/vulkan/ffv1_common.glsl
index 0133911319..8580a0777f 100644
--- a/libavcodec/vulkan/ffv1_common.glsl
+++ b/libavcodec/vulkan/ffv1_common.glsl
@@ -100,7 +100,7 @@ struct SliceContext {
     uint slice_coding_mode;
     bool slice_reset_contexts;
 
-    u16vec4 remap_count;
+    i32vec4 remap_count;
 
     /* Decoder-only */
     uint remap;
@@ -142,20 +142,22 @@ u16vec4 get_slice_bits(in SliceContext sc)
 #ifndef FLOAT
     return u16vec4(c_bits, c_bits, c_bits, c_bits);
 #else
-    u16vec4 bits = sc.remap_count;
+    u32vec4 cnt = sc.remap_count;
 #if defined(ENCODE)
     if (remap_mode == 0)
 #elif defined(DECODE)
     if (sc.remap == 0)
 #endif
-        bits = u16vec4(ivec4(rct_offset, rct_offset, rct_offset, rct_offset));
+        cnt = u32vec4(uint(rct_offset), uint(rct_offset),
+                      uint(rct_offset), uint(rct_offset));
 
+    u16vec4 bits = u16vec4(cnt);
     if (sc.slice_coding_mode == 0) {
-        uint16_t max3 = max(bits[0], max(bits[1], bits[2]));
+        uint max3 = max(cnt[0], max(cnt[1], cnt[2]));
         bits = u16vec4(ceil_log2(max3),
-                       ceil_log2(bits[0] + bits[1]),
-                       ceil_log2(bits[0] + bits[2]),
-                       bits[3]);
+                       ceil_log2(cnt[0] + cnt[1]),
+                       ceil_log2(cnt[0] + cnt[2]),
+                       ceil_log2(cnt[3]));
     }
 
     return bits;
diff --git a/libavcodec/vulkan/ffv1_dec.comp.glsl b/libavcodec/vulkan/ffv1_dec.comp.glsl
index 0877f699f2..82835e8f92 100644
--- a/libavcodec/vulkan/ffv1_dec.comp.glsl
+++ b/libavcodec/vulkan/ffv1_dec.comp.glsl
@@ -286,9 +286,12 @@ void writeout_rgb(uint slice_idx, in SliceContext sc, ivec2 sp, int w, int y,
         pix = pix.gbra;
         vec4 pd;
         for (int i = 0; i < color_planes; i++) {
-            uint v = fltmap[slice_idx][i][pix[i] & (rct_offset - 1)];
-            float16_t vf = uint16BitsToFloat16(uint16_t(v));
-            pd[i] = float(vf);
+            uint mask = (1u << ceil_log2(sc.remap_count[i])) - 1u;
+            uint v = fltmap[slice_idx][i][uint(pix[i]) & mask];
+            if (c_bits >= 32)
+                pd[i] = uintBitsToFloat(v);
+            else
+                pd[i] = float(uint16BitsToFloat16(uint16_t(v)));
         }
         pd = pd.brga;
 
diff --git a/libavcodec/vulkan/ffv1_dec_setup.comp.glsl b/libavcodec/vulkan/ffv1_dec_setup.comp.glsl
index 7f9e6a28ae..ff57c57dc3 100644
--- a/libavcodec/vulkan/ffv1_dec_setup.comp.glsl
+++ b/libavcodec/vulkan/ffv1_dec_setup.comp.glsl
@@ -23,7 +23,7 @@
 #pragma shader_stage(compute)
 #extension GL_GOOGLE_include_directive : require
 
-#define NB_CONTEXTS 2
+#define NB_CONTEXTS 6
 #include "common.glsl"
 #include "ffv1_common.glsl"
 
@@ -63,42 +63,110 @@ uint get_usymbol(const uint ctx_off)
     return a;
 }
 
+/* Decodes one signed symbol with the range coder. Relative to ctx_off the
+ * contexts are: 0 = zero flag, 1..10 = unary exponent, 11..21 = sign,
+ * 22..31 = mantissa bits. Returns 0x7FFFFFFF, which decode_remap() rejects,
+ * if the exponent exceeds 31 bits. */
+int get_isymbol(const uint ctx_off)
+{
+    if (get_rac(rc_state[ctx_off]))
+        return 0;
+
+    int e = 0;
+    while (get_rac(rc_state[ctx_off + 1 + min(e, 9)])) { // 1..10
+        e++;
+        /* A corrupt stream could otherwise keep this loop running unbounded */
+        if (e > 31)
+            return 0x7FFFFFFF;
+    }
+
+    int a = 1;
+    for (int i = e - 1; i >= 0; i--) {
+        a <<= 1;
+        a |= int(get_rac(rc_state[ctx_off + 22 + min(i, 9)])); // 22..31
+    }
+
+    return get_rac(rc_state[ctx_off + 11 + min(e, 10)]) ? -a : a;
+}
+
+/* Multiplier table for decode_remap(): entries set to -1 are decoded on
+ * first use. NOTE(review): declared shared, so this presumably expects one
+ * invocation per workgroup to run decode_remap() - confirm. */
+shared int mul[4096 + 1];
+
+/* Returns the multiplier selected by position i, decoding it with the
+ * contexts at ctx_off the first time its mul[] slot is used. */
+int decode_current_mul(uint ctx_off, int mul_count, int64_t i)
+{
+    int ndx = int((i * int64_t(mul_count)) >> 32);
+    if (mul[ndx] < 0)
+        mul[ndx] = int(get_usymbol(ctx_off)) & 0x3FFFFFFF;
+    return mul[ndx];
+}
+
 void decode_remap(uint slice_idx, inout SliceContext sc)
 {
-    int end = rct_offset - 1;
-    int flip = sc.remap == 2 ? (end >> 1) : 0;
+    uint end = uint(rct_offset - 1);
+    uint flip_mask = end ^ (end >> 1);
+    uint flip = sc.remap == 2 ? (end >> 1) : 0u;
 
     for (int p = 0; p < color_planes; p++) {
         int j = 0;
         int lu = 0;
 
         [[unroll]]
-        for (int i = 0; i < NB_CONTEXTS; i++)
-            rc_state[i] = uint8_t(128);
+        for (int k = 0; k < NB_CONTEXTS*CONTEXT_SIZE; k++)
+            rc_state[k] = uint8_t(128);
 
-        get_usymbol(0);
+        /* Range-check before the int cast: a symbol >= 2^31 would become
+         * negative, pass a signed test and index mul[] out of bounds */
+        uint mul_count_raw = get_usymbol(0);
+        if (mul_count_raw > 4096u) {
+            sc.remap_count[p] = j;
+            return;
+        }
+        int mul_count = int(mul_count_raw);
+        for (int mi = 0; mi < mul_count; mi++)
+            mul[mi] = -1;
+        mul[mul_count] = 1;
 
         [[unroll]]
-        for (int i = 0; i < NB_CONTEXTS*CONTEXT_SIZE; i++)
-            rc_state[i] = uint8_t(128);
+        for (int k = 0; k < NB_CONTEXTS*CONTEXT_SIZE; k++)
+            rc_state[k] = uint8_t(128);
 
-        for (uint i = 0; i <= end; ) {
-            uint run = get_usymbol(lu*CONTEXT_SIZE);
-            uint run0 = lu != 0 ? 0 : run;
-            uint run1 = lu != 0 ? run : 1;
+        int current_mul = 1;
+        int64_t i = 0;
+        while (i <= int64_t(end)) {
+            uint run = get_usymbol(uint(lu*3 + 0)*CONTEXT_SIZE);
+            uint run0 = lu != 0 ? 0u : run;
+            uint run1 = lu != 0 ? run : 1u;
 
-            i += run0;
-            while (run1-- > 0) {
-                if (i - 1 >= end)
+            i += int64_t(run0) * int64_t(current_mul);
+
+            while (run1 > 0u) {
+                run1--;
+                if (current_mul > 1) {
+                    int delta = get_isymbol(uint(lu*3 + 1)*CONTEXT_SIZE);
+                    if (delta <= -current_mul || delta > current_mul/2) {
+                        sc.remap_count[p] = j;
+                        return;
+                    }
+                    i += int64_t(current_mul - 1 + delta);
+                }
+                if (i - 1 >= int64_t(end))
                     break;
-                fltmap[slice_idx][p][j++] = i ^ (((i & 0x8000) != 0) ? 0 : flip);
+                uint iv = uint(i);
+                /* NOTE(review): j is not checked against the size of
+                 * fltmap[slice_idx][p] here - confirm the buffer covers it */
+                fltmap[slice_idx][p][j++] = iv ^ (((iv & flip_mask) != 0u) ? 0u : flip);
                 i++;
+                current_mul = decode_current_mul(uint(2)*CONTEXT_SIZE, mul_count, i);
             }
-            if (lu > 0)
-                i++;
-            lu ^= int(run == 0);
+            if (lu != 0)
+                i += int64_t(current_mul);
+            lu ^= int(run == 0u);
         }
-        sc.remap_count[p] = uint16_t(j);
+        sc.remap_count[p] = j;
     }
 }
 
diff --git a/libavcodec/vulkan/ffv1_enc_setup.comp.glsl b/libavcodec/vulkan/ffv1_enc_setup.comp.glsl
index e1b64a980c..53a8d7f13f 100644
--- a/libavcodec/vulkan/ffv1_enc_setup.comp.glsl
+++ b/libavcodec/vulkan/ffv1_enc_setup.comp.glsl
@@ -113,7 +113,7 @@ void encode_histogram_remap(uint slice_idx, inout SliceContext sc)
         if (run != 0)
             put_usymbol(run, lu*CONTEXT_SIZE);
 
-        sc.remap_count[p] = U16(j);
+        sc.remap_count[p] = int(j);
     }
 }
 