ffmpeg/libavcodec/vulkan/ffv1_dec.comp

/*
 * FFv1 codec
 *
 * Copyright (c) 2024 Lynne <dev@lynne.ee>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * LADDR(p) maps a pixel offset to the offset actually used when addressing
 * the dec[] images.
 *
 * Without RGB it is the identity. With RGB, the y coordinate is masked with
 * RGB_LBUF, so rows wrap around within RGB_LINECACHE lines. The mask only
 * behaves as a modulo because RGB_LINECACHE is a power of two.
 */
#ifndef RGB
#define LADDR(p) (p)
#else
#define RGB_LINECACHE 2
#define RGB_LBUF (RGB_LINECACHE - 1)
#define LADDR(p) (ivec2((p).x, ((p).y & RGB_LBUF)))
#endif

#ifdef RGB
/*
 * RGB variant: compute the context and the prediction for the sample at
 * offset off of plane p.
 *
 * sp              - base position added to every LADDR()-wrapped offset
 * off             - sample offset; its y component is wrapped by LADDR()
 * p               - index into dec[]
 * sw              - line width, used to clamp the top-right neighbour at
 *                   the last column
 * quant_table_idx - selects the quant_table[] set to use
 *
 * Returns ivec2(context, prediction). The context is the signed sum of the
 * quantized neighbour differences.
 *
 * Unlike the non-RGB variant, no bounds checks are done on the row: all
 * loads go through LADDR(), so they always land inside the line cache.
 */
ivec2 get_pred(ivec2 sp, ivec2 off, int p, int sw, uint8_t quant_table_idx)
{
    /* In the first column there is no left neighbour: the "left" and
     * "top-left" fetches are moved one column right and one row up instead */
    const ivec2 yoff_border1 = expectEXT(off.x == 0, false) ? ivec2(1, -1) : ivec2(0, 0);

    /* top[0] = top-left, top[1] = top, top[2] = top-right (clamped to the
     * last column).
     * Thanks to the same coincidence as below, we can skip checking if off == 0, 1 */
    VTYPE3 top  = VTYPE3(TYPE(imageLoad(dec[p], sp + LADDR(off + ivec2(-1, -1) + yoff_border1))[0]),
                         TYPE(imageLoad(dec[p], sp + LADDR(off + ivec2(0, -1)))[0]),
                         TYPE(imageLoad(dec[p], sp + LADDR(off + ivec2(min(1, sw - off.x - 1), -1)))[0]));

    /* Normally, we'd need to check if off != ivec2(0, 0) here, since otherwise, we must
     * return zero. However, ivec2(-1,  0) + ivec2(1, -1) == ivec2(0, -1), e.g. previous
     * row, 0 offset, same slice, which is zero since we zero out the buffer for RGB */
    TYPE cur = TYPE(imageLoad(dec[p], sp + LADDR(off + ivec2(-1,  0) + yoff_border1))[0]);

    /* Context from the three basic neighbour differences */
    int base = quant_table[quant_table_idx][0][(cur    - top[0]) & MAX_QUANT_TABLE_MASK] +
               quant_table[quant_table_idx][1][(top[0] - top[1]) & MAX_QUANT_TABLE_MASK] +
               quant_table[quant_table_idx][2][(top[1] - top[2]) & MAX_QUANT_TABLE_MASK];

    /* Optional extended context using two more neighbours (left-left and top-top) */
    if (expectEXT(extend_lookup[quant_table_idx] > 0, false)) {
        TYPE cur2 = TYPE(0);
        if (expectEXT(off.x > 0, true)) {
            /* In the second column, left-left is taken from column 0 of the
             * previous row; elsewhere it is two columns to the left */
            const ivec2 yoff_border2 = expectEXT(off.x == 1, false) ? ivec2(-1, -1) : ivec2(-2, 0);
            cur2 = TYPE(imageLoad(dec[p], sp + LADDR(off + yoff_border2))[0]);
        }
        base += quant_table[quant_table_idx][3][(cur2 - cur) & MAX_QUANT_TABLE_MASK];

        /* top-2 became current upon swap: with a 2-line cache, row y - 2 wraps
         * onto the row being decoded, so the not-yet-overwritten value at the
         * current position is the sample two rows up */
        TYPE top2 = TYPE(imageLoad(dec[p], sp + LADDR(off))[0]);
        base += quant_table[quant_table_idx][4][(top2 - top[1]) & MAX_QUANT_TABLE_MASK];
    }

    /* context, prediction */
    return ivec2(base, predict(cur, VTYPE2(top)));
}
#else
/*
 * Non-RGB variant: compute the context and the prediction for the sample at
 * offset off (relative to the slice position sp) of plane p.
 *
 * sw is the line width, used to clamp the top-right neighbour at the last
 * column. quant_table_idx selects the quant_table[] set to use.
 *
 * Neighbours that fall outside of the slice are treated as zero.
 * Returns ivec2(context, prediction); the context is signed.
 */
ivec2 get_pred(ivec2 sp, ivec2 off, int p, int sw, uint8_t quant_table_idx)
{
    const bool first_col = off.x == 0;
    const bool has_top   = off.y > 0;

    /* In the first column, the "left" neighbours are fetched one column to
     * the right and one row up instead */
    const ivec2 wrap1 = first_col ? ivec2(1, -1) : ivec2(0, 0);
    const ivec2 pos   = sp + off;

    /* top[0] = top-left, top[1] = top, top[2] = top-right */
    VTYPE3 top = VTYPE3(TYPE(0), TYPE(0), TYPE(0));
    if (has_top) {
        /* At (0, 1) the wrapped top-left would point two rows up, above the slice */
        if (off != ivec2(0, 1))
            top[0] = TYPE(imageLoad(dec[p], pos + ivec2(-1, -1) + wrap1)[0]);

        top[1] = TYPE(imageLoad(dec[p], pos + ivec2(0, -1))[0]);
        top[2] = TYPE(imageLoad(dec[p], pos + ivec2(min(1, sw - off.x - 1), -1))[0]);
    }

    /* Left neighbour, unavailable only for the very first sample */
    TYPE cur = TYPE(0);
    if (off != ivec2(0, 0))
        cur = TYPE(imageLoad(dec[p], pos + ivec2(-1,  0) + wrap1)[0]);

    int base = quant_table[quant_table_idx][0][(cur - top[0]) & MAX_QUANT_TABLE_MASK] +
               quant_table[quant_table_idx][1][(top[0] - top[1]) & MAX_QUANT_TABLE_MASK] +
               quant_table[quant_table_idx][2][(top[1] - top[2]) & MAX_QUANT_TABLE_MASK];

    /* The extended context is in use if either of the extra tables is non-zero */
    const bool extended = (quant_table[quant_table_idx][3][127] != 0) ||
                          (quant_table[quant_table_idx][4][127] != 0);
    if (extended) {
        /* Left-left neighbour */
        TYPE cur2 = TYPE(0);
        if (off.x > 0 && off != ivec2(1, 0)) {
            const ivec2 wrap2 = off.x == 1 ? ivec2(1, -1) : ivec2(0, 0);
            cur2 = TYPE(imageLoad(dec[p], pos + ivec2(-2,  0) + wrap2)[0]);
        }
        base += quant_table[quant_table_idx][3][(cur2 - cur) & MAX_QUANT_TABLE_MASK];

        /* Top-top neighbour */
        TYPE top2 = TYPE(0);
        if (off.y > 1)
            top2 = TYPE(imageLoad(dec[p], pos + ivec2(0, -2))[0]);
        base += quant_table[quant_table_idx][4][(top2 - top[1]) & MAX_QUANT_TABLE_MASK];
    }

    /* context, prediction */
    return ivec2(base, predict(cur, VTYPE2(top)));
}
#endif

#ifndef GOLOMB
/*
 * READ(c, off) reads one range-coded bit using the adaptive state at index
 * off of the current context.
 *
 * With CACHED_SYMBOL_READER, the context is staged in the shared state[]
 * array and accessed there directly. Otherwise the state is addressed in
 * the slice_state buffer; that form expects a variable named state_off to
 * be in scope at the point of expansion.
 */
#ifdef CACHED_SYMBOL_READER
shared uint8_t state[CONTEXT_SIZE];
#define READ(c, off) get_rac_direct(c, state[off])
#else
#define READ(c, off) get_rac(c, uint64_t(slice_state) + (state_off + off))
#endif

/*
 * Read one signed symbol from the range coder.
 *
 * state_off is the offset of the context being used; it is consumed by the
 * READ() macro when the states are not cached in shared memory.
 *
 * State layout within a context, as used below:
 *   0       zero flag
 *   1..10   exponent (unary)
 *   11..21  sign
 *   22..31  mantissa bits
 *
 * Sets the global corrupt flag and returns 0 if the exponent is too long.
 */
int get_isymbol(inout RangeCoder c, uint state_off)
{
    if (READ(c, 0))
        return 0;

    /* Unary-coded exponent, offset by one */
    uint e = 1;
    while (e < 33) {
        if (!READ(c, min(e, 10)))
            break;
        e++;
    }

    if (expectEXT(e == 33, false)) {
        /* Exponent never terminated */
        corrupt = true;
        return 0;
    }

    /* No mantissa bits: magnitude is 1, only the sign is coded */
    if (expectEXT(e == 1, false))
        return READ(c, 11) ? -1 : 1;

    /* Implicit leading one, followed by the mantissa bits, MSB first */
    int mag = 1;
    for (uint ctx = e + 20; ctx >= 22; ctx--)
        mag = (mag << 1) | int(READ(c, min(ctx, 31)));

    bool neg = READ(c, min(e + 10, 21));
    return neg ? -mag : mag;
}

/*
 * Decode line y of plane p of a PCM-coded slice: every sample is stored as
 * bits equiprobable range-coder bits, MSB first, without any prediction.
 *
 * sp is the slice position and w the slice width; without RGB, both are
 * scaled by chroma_shift for planes 1 and 2.
 */
void decode_line_pcm(inout SliceContext sc, ivec2 sp, int w, int y, int p, int bits)
{
#ifndef RGB
    const bool subsampled = (p > 0 && p < 3);
    if (subsampled) {
        sp >>= chroma_shift;
        w  >>= chroma_shift.x;
    }
#endif

    int x = 0;
    while (x < w) {
        uint val = 0;

        int bit = bits - 1;
        while (bit >= 0) {
            val |= uint(get_rac_equi(sc.c)) << bit;
            bit--;
        }

        imageStore(dec[p], sp + LADDR(ivec2(x, y)), uvec4(val));
        x++;
    }
}

/*
 * Decode line y of plane p of a range-coded slice.
 *
 * sp              - slice position; scaled by chroma_shift for planes 1
 *                   and 2 when not RGB
 * w               - slice width; scaled the same way
 * bits            - sample width passed to zero_extend()
 * state_off       - offset of this plane's states within slice_state
 * quant_table_idx - quantization table set used for the context
 * run_index       - unused here; kept so that the signature matches the
 *                   Golomb variant
 *
 * With CACHED_SYMBOL_READER, every invocation of the workgroup loads one
 * byte of the selected context into the shared state[] array, invocation 0
 * decodes the symbol from it, and every invocation then writes its byte
 * back. Both barrier() calls are executed by all invocations, so the loop
 * must run the same number of times for each of them.
 */
void decode_line(inout SliceContext sc, ivec2 sp, int w,
                 int y, int p, int bits, uint state_off,
                 uint8_t quant_table_idx, const int run_index)
{
#ifndef RGB
    if (p > 0 && p < 3) {
        w >>= chroma_shift.x;
        sp >>= chroma_shift;
    }
#endif

    for (int x = 0; x < w; x++) {
        ivec2 pr = get_pred(sp, ivec2(x, y), p, w,
                            quant_table_idx);

        /* The sign of the context only selects the sign of the residual */
        uint context_off = state_off + CONTEXT_SIZE*abs(pr[0]);
#ifdef CACHED_SYMBOL_READER
        u8buf sb = u8buf(uint64_t(slice_state) + context_off + gl_LocalInvocationID.x);
        state[gl_LocalInvocationID.x] = sb.v;

        /* The whole context must be in shared memory before it is used */
        barrier();
        if (gl_LocalInvocationID.x == 0) {

#endif

            int diff = get_isymbol(sc.c, context_off);
            if (pr[0] < 0)
                diff = -diff;

            uint v = zero_extend(pr[1] + diff, bits);
            imageStore(dec[p], sp + LADDR(ivec2(x, y)), uvec4(v));

#ifdef CACHED_SYMBOL_READER
        }

        /* Invocation 0 must be done with state[] before any other
         * invocation reads its entry back, otherwise a stale value may be
         * written to slice_state */
        barrier();
        sb.v = state[gl_LocalInvocationID.x];
#endif
    }
}

#else /* GOLOMB */

/*
 * Decode line y of plane p of a Golomb-Rice coded slice.
 *
 * sp              - slice position; scaled by chroma_shift for planes 1
 *                   and 2 when not RGB
 * w               - slice width; scaled the same way
 * bits            - sample width passed to read_vlc_symbol()/zero_extend()
 * state_off       - offset of this plane's VLC states within slice_state
 * quant_table_idx - quantization table set used for the context
 * run_index       - index into log2_run[], updated across calls
 *
 * run_mode tracks the run-length state within the line:
 *   0 - not in a run, every sample carries a coded residual
 *   1 - in a run, a new run length is read whenever run_count reaches 0
 *   2 - in the last (explicitly sized) part of a run, no more run lengths
 *       are read until the run ends
 */
void decode_line(inout SliceContext sc, ivec2 sp, int w,
                 int y, int p, int bits, uint state_off,
                 uint8_t quant_table_idx, inout int run_index)
{
#ifndef RGB
    if (p > 0 && p < 3) {
        w >>= chroma_shift.x;
        sp >>= chroma_shift;
    }
#endif

    int run_count = 0;
    int run_mode  = 0;

    for (int x = 0; x < w; x++) {
        int diff;
        ivec2 pr = get_pred(sp, ivec2(x, y), p, w,
                            quant_table_idx);

        VlcState sb = VlcState(uint64_t(slice_state) + state_off + VLC_STATE_SIZE*abs(pr[0]));

        /* A zero context starts a run */
        if (pr[0] == 0 && run_mode == 0)
            run_mode = 1;

        if (run_mode != 0) {
            if (run_count == 0 && run_mode == 1) {
                int tmp_idx = int(log2_run[run_index]);
                if (get_bit(sc.gb)) {
                    /* Full-length run; grow the run size unless it
                     * extends past the end of the line */
                    run_count = 1 << tmp_idx;
                    if (x + run_count <= w)
                        run_index++;
                } else {
                    /* Shorter run with an explicitly coded length */
                    if (tmp_idx != 0) {
                        run_count = int(get_bits(sc.gb, tmp_idx));
                    } else
                        run_count = 0;

                    if (run_index != 0)
                        run_index--;
                    run_mode = 2;
                }
            }

            run_count--;
            if (run_count < 0) {
                /* The run is over: this sample has a residual, which
                 * cannot be zero, so non-negative values are shifted up */
                run_mode  = 0;
                run_count = 0;
                diff = read_vlc_symbol(sc.gb, sb, bits);
                if (diff >= 0)
                    diff++;
            } else {
                diff = 0;
            }
        } else {
            diff = read_vlc_symbol(sc.gb, sb, bits);
        }

        /* The sign of the context selects the sign of the residual */
        if (pr[0] < 0)
            diff = -diff;

        uint v = zero_extend(pr[1] + diff, bits);
        imageStore(dec[p], sp + LADDR(ivec2(x, y)), uvec4(v));
    }
}
#endif

#ifdef RGB
/*
 * Convert one decoded sample to RGB and reorder its components.
 *
 * pix holds the decoded planes with the two difference components in .b
 * and .r, both biased by rct_offset, and the remaining component in .g.
 * rct_coef holds the weights of the .r (x) and .b (y) terms. The alpha
 * component is passed through unchanged.
 *
 * The result is permuted through fmt_lut[] to match the output layout.
 */
ivec4 transform_sample(ivec4 pix, ivec2 rct_coef)
{
    /* Remove the bias from the difference components */
    int db = pix.b - rct_offset;
    int dr = pix.r - rct_offset;

    /* Recover green first, then add it back to get blue and red */
    int g = pix.g - ((db*rct_coef.y + dr*rct_coef.x) >> 2);

    ivec4 rgba = ivec4(dr + g, g, db + g, pix.a);

    return ivec4(rgba[fmt_lut[0]], rgba[fmt_lut[1]],
                 rgba[fmt_lut[2]], rgba[fmt_lut[3]]);
}

/*
 * Write line y of an RGB slice from the dec[] line cache to the output
 * image(s).
 *
 * sp        - base position of the slice within the line cache
 * w         - slice width
 * apply_rct - whether transform_sample() must be applied to the samples
 *
 * The columns are split between the invocations of the workgroup: each
 * one starts at its own index and advances by the workgroup size.
 */
void writeout_rgb(in SliceContext sc, ivec2 sp, int w, int y, bool apply_rct)
{
    for (uint x = gl_LocalInvocationID.x; x < w; x += gl_WorkGroupSize.x) {
        ivec2 lpos = sp + LADDR(ivec2(x, y));
        ivec2 pos = sc.slice_pos + ivec2(x, y);

        /* Zero-initialized so that alpha holds a defined value when there
         * is no transparency plane to load it from */
        ivec4 pix = ivec4(0);
        pix.r = int(imageLoad(dec[2], lpos)[0]);
        pix.g = int(imageLoad(dec[0], lpos)[0]);
        pix.b = int(imageLoad(dec[1], lpos)[0]);
        if (transparency != 0)
            pix.a = int(imageLoad(dec[3], lpos)[0]);

        if (expectEXT(apply_rct, true))
            pix = transform_sample(pix, sc.slice_rct_coef);

        imageStore(dst[0], pos, pix);
        if (planar_rgb != 0) {
            /* Planar output: one component per image */
            for (int i = 1; i < color_planes; i++)
                imageStore(dst[i], pos, ivec4(pix[i]));
        }
    }
}
#endif

/*
 * Decode all planes of a single slice.
 *
 * sc        - context of the slice to decode
 * slice_idx - index of the slice, used to locate its states
 *
 * Without RGB, the planes are decoded one after another, line by line.
 * With RGB, all planes of a line are decoded into the line cache, and the
 * line is then written out before moving to the next one.
 */
void decode_slice(inout SliceContext sc, const uint slice_idx)
{
    int run_index = 0;
    int w = sc.slice_dim.x;
    ivec2 sp = sc.slice_pos;

#ifndef RGB
    int bits = bits_per_raw_sample;
#else
    /* One extra bit is used unless the slice is PCM coded (coding mode 1) */
    int bits = bits_per_raw_sample + int(sc.slice_coding_mode != 1);

    /* Each row of workgroups owns RGB_LINECACHE lines of the line cache */
    sp.y = int(gl_WorkGroupID.y)*RGB_LINECACHE;
#endif

    /* PCM coding */
#ifndef GOLOMB
    if (sc.slice_coding_mode == 1) {
        /* PCM decoding is done by a single invocation */
        if (gl_LocalInvocationID.x > 0)
            return;
#ifndef RGB
        for (int p = 0; p < planes; p++) {
            int h = sc.slice_dim.y;
            if (p > 0 && p < 3)
                h >>= chroma_shift.y;

            for (int y = 0; y < h; y++)
                decode_line_pcm(sc, sp, w, y, p, bits);
        }
#else
        for (int y = 0; y < sc.slice_dim.y; y++) {
            for (int p = 0; p < color_planes; p++)
                decode_line_pcm(sc, sp, w, y, p, bits);

            /* NOTE(review): only invocation 0 gets here, while
             * writeout_rgb() advances by gl_WorkGroupSize.x columns.
             * Confirm that PCM slices are never decoded with a workgroup
             * wider than one invocation, otherwise columns are skipped. */
            writeout_rgb(sc, sp, w, y, false);
        }
#endif
    } else

    /* Arithmetic coding */
#endif
    {
        /* Planes 1 and 2 share the same quantization table and states */
        u8vec4 quant_table_idx = sc.quant_table_idx.xyyz;
        u32vec4 slice_state_off = (slice_idx*codec_planes + uvec4(0, 1, 1, 2))*plane_state_size;

#ifndef RGB
        for (int p = 0; p < planes; p++) {
            int h = sc.slice_dim.y;
            if (p > 0 && p < 3)
                h >>= chroma_shift.y;

            /* Planes are coded independently, so the run state must not
             * carry over from the previous plane */
            run_index = 0;

            for (int y = 0; y < h; y++)
                decode_line(sc, sp, w, y, p, bits,
                            slice_state_off[p], quant_table_idx[p], run_index);
        }
#else
        /* Planes are interleaved line by line and share one run state */
        for (int y = 0; y < sc.slice_dim.y; y++) {
            for (int p = 0; p < color_planes; p++)
                decode_line(sc, sp, w, y, p, bits,
                            slice_state_off[p], quant_table_idx[p], run_index);

            writeout_rgb(sc, sp, w, y, true);
        }
#endif
    }
}

/*
 * Entry point: each workgroup decodes one slice, with the 2D workgroup
 * grid flattened row by row into the slice index.
 */
void main(void)
{
    const uvec2 wg = gl_WorkGroupID.xy;
    const uint slice_idx = wg.y*gl_NumWorkGroups.x + wg.x;

    decode_slice(slice_ctx[slice_idx], slice_idx);
}