Files
ffmpeg/libavcodec/vulkan/prores_raw_idct.comp
Lynne 7af5b5cec3 vulkan_prores_raw: use the native image representation
It allows us to easily synchronize the software and hardware
decoders, by removing the abstraction the Vulkan layer added by changing
the values written.
2025-11-26 15:16:42 +01:00

175 lines
6.5 KiB
Plaintext

/*
* ProRes RAW decoder
*
* Copyright (c) 2025 Lynne <dev@lynne.ee>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/* Component handled by this invocation. main() turns it into the pixel offset
 * (COMP_ID & 1, COMP_ID >> 1) and into a 64-entry slice of blocks[][]. */
#define COMP_ID (gl_LocalInvocationID.z)
/* 8x8 block handled by this invocation: selects the first dimension of
 * blocks[][] and the horizontal block position inside the tile. */
#define BLOCK_ID (gl_LocalInvocationID.y)
/* Row (stride-1 idct8 pass) or column (stride-8 idct8 pass) that this
 * invocation transforms within its block. */
#define ROW_ID (gl_LocalInvocationID.x)
/* Workgroup-shared coefficient storage, indexed as
 * [BLOCK_ID][COMP_ID*64 + row-major position inside the 8x8 block]. */
shared float blocks[16][4*64];
/*
 * Load position for each coefficient. main() reads entry i of the row-major
 * 8x8 coefficient block from the image texel located at scan[i], an (x, y)
 * offset that is added to the block's origin for the current component.
 * Every offset in the table is even and lies in the range 0..14.
 */
const ivec2 scan[64] = {
ivec2( 0, 0), ivec2( 4, 0), ivec2( 0, 2), ivec2( 4, 2),
ivec2( 0, 8), ivec2( 4, 8), ivec2( 6, 8), ivec2( 2, 10),
ivec2( 2, 0), ivec2( 6, 0), ivec2( 2, 2), ivec2( 6, 2),
ivec2( 2, 8), ivec2( 8, 8), ivec2( 0, 10), ivec2( 4, 10),
ivec2( 8, 0), ivec2(12, 0), ivec2( 8, 2), ivec2(12, 2),
ivec2(10, 8), ivec2(14, 8), ivec2( 6, 10), ivec2( 2, 12),
ivec2(10, 0), ivec2(14, 0), ivec2(10, 2), ivec2(14, 2),
ivec2(12, 8), ivec2( 8, 10), ivec2( 0, 12), ivec2( 4, 12),
ivec2( 0, 4), ivec2( 4, 4), ivec2( 6, 4), ivec2( 2, 6),
ivec2(10, 10), ivec2(14, 10), ivec2( 6, 12), ivec2( 2, 14),
ivec2( 2, 4), ivec2( 8, 4), ivec2( 0, 6), ivec2( 4, 6),
ivec2(12, 10), ivec2( 8, 12), ivec2( 0, 14), ivec2( 4, 14),
ivec2(10, 4), ivec2(14, 4), ivec2( 6, 6), ivec2(12, 6),
ivec2(10, 12), ivec2(14, 12), ivec2( 6, 14), ivec2(12, 14),
ivec2(12, 4), ivec2( 8, 6), ivec2(10, 6), ivec2(14, 6),
ivec2(12, 12), ivec2( 8, 14), ivec2(10, 14), ivec2(14, 14),
};
/*
 * Per-coefficient scale that main() multiplies into entry i of the row-major
 * 8x8 block while loading it. To the printed precision the entries equal
 * s[row]*s[col] with s[0] = 1/(2*sqrt(2)) and s[k] = cos(k*pi/16)/2 for
 * k = 1..7.
 * NOTE(review): presumably this is the normalisation that the scaled
 * butterflies in idct8() leave out - confirm before changing either.
 */
const float idct_scale[64] = {
0.1250000000000000, 0.1733799806652684, 0.1633203706095471, 0.1469844503024199,
0.1250000000000000, 0.0982118697983878, 0.0676495125182746, 0.0344874224103679,
0.1733799806652684, 0.2404849415639108, 0.2265318615882219, 0.2038732892122293,
0.1733799806652684, 0.1362237766939547, 0.0938325693794663, 0.0478354290456362,
0.1633203706095471, 0.2265318615882219, 0.2133883476483184, 0.1920444391778541,
0.1633203706095471, 0.1283199917898342, 0.0883883476483185, 0.0450599888754343,
0.1469844503024199, 0.2038732892122293, 0.1920444391778541, 0.1728354290456362,
0.1469844503024199, 0.1154849415639109, 0.0795474112858021, 0.0405529186026822,
0.1250000000000000, 0.1733799806652684, 0.1633203706095471, 0.1469844503024199,
0.1250000000000000, 0.0982118697983878, 0.0676495125182746, 0.0344874224103679,
0.0982118697983878, 0.1362237766939547, 0.1283199917898342, 0.1154849415639109,
0.0982118697983878, 0.0771645709543638, 0.0531518809229535, 0.0270965939155924,
0.0676495125182746, 0.0938325693794663, 0.0883883476483185, 0.0795474112858021,
0.0676495125182746, 0.0531518809229535, 0.0366116523516816, 0.0186644585125857,
0.0344874224103679, 0.0478354290456362, 0.0450599888754343, 0.0405529186026822,
0.0344874224103679, 0.0270965939155924, 0.0186644585125857, 0.0095150584360892,
};
/*
 * In-place 8-point inverse transform over one strided line of blocks[block].
 *
 * The eight inputs are read from blocks[block][k*stride + offset], k = 0..7,
 * and the eight results are written back to the same positions. No
 * synchronisation happens in here; callers must order accesses themselves.
 *
 *   block  - first index into blocks[][]
 *   offset - position of the first sample of the line
 *   stride - distance between consecutive samples of the line
 */
void idct8(uint block, uint offset, uint stride)
{
    const float k_sqrt2   = 1.4142135623730950488016887242097f;
    const float k_rot_sum = 1.8477590650225735122563663787936f;
    const float k_rot_lo  = 1.0823922002923939687994464107328f;
    const float k_rot_hi  = 2.6131259297527530557132863468544f;

    /* Gather the line */
    const float x0 = blocks[block][0*stride + offset];
    const float x1 = blocks[block][1*stride + offset];
    const float x2 = blocks[block][2*stride + offset];
    const float x3 = blocks[block][3*stride + offset];
    const float x4 = blocks[block][4*stride + offset];
    const float x5 = blocks[block][5*stride + offset];
    const float x6 = blocks[block][6*stride + offset];
    const float x7 = blocks[block][7*stride + offset];

    /* Even-indexed inputs: embedded scaled inverse 4-point Type-II DCT */
    const float sum04  = x0 + x4;
    const float diff04 = x0 - x4;
    const float sum26  = x2 + x6;
    const float rot26  = (x2 - x6)*(k_sqrt2) - sum26;

    const float even0 = sum04 + sum26;
    const float even3 = sum04 - sum26;
    const float even1 = diff04 + rot26;
    const float even2 = diff04 - rot26;

    /* Odd-indexed inputs: embedded scaled inverse 4-point Type-IV DST */
    const float sum53  = x5 + x3;
    const float diff53 = x5 - x3;
    const float sum17  = x1 + x7;
    const float diff17 = x1 - x7;

    const float mid    = (sum17 - sum53)*(k_sqrt2);
    const float shared_rot = (diff17 + diff53)*(k_rot_sum);
    const float rot17  = shared_rot - diff17*(k_rot_lo);
    const float rot53  = shared_rot - diff53*(k_rot_hi);

    /* Each odd term builds on the previous one */
    const float odd7 = sum17 + sum53;
    const float odd6 = odd7 - rot53;
    const float odd5 = odd6 + mid;
    const float odd4 = odd5 - rot17;

    /* Final butterflies, written straight back to the line */
    blocks[block][0*stride + offset] = even0 + odd7;
    blocks[block][1*stride + offset] = even1 - odd6;
    blocks[block][2*stride + offset] = even2 + odd5;
    blocks[block][3*stride + offset] = even3 - odd4;
    blocks[block][4*stride + offset] = even3 + odd4;
    blocks[block][5*stride + offset] = even2 - odd5;
    blocks[block][6*stride + offset] = even1 + odd6;
    blocks[block][7*stride + offset] = even0 - odd7;
}
/*
 * Dequantise, inverse-transform and store one 8x8 block of one component.
 *
 * The invocations that share a (BLOCK_ID, COMP_ID) pair cooperate on one
 * 64-entry slice of blocks[][]: they load the coefficients from dst through
 * scan[], scale them, run idct8() once with stride 1 and once with stride 8,
 * and write the result back to dst as values clamped to 0..4095 and shifted
 * left by 4.
 *
 * NOTE(review): the idct8() calls address lines by ROW_ID, which presumably
 * requires a workgroup x size of 8 - confirm against the host code.
 */
void main(void)
{
    const uint tile_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x;
    TileData td = tile_data[tile_idx];

    /* td depends only on the workgroup ID, so it is identical for every
     * invocation of the workgroup. */
    if (expectEXT(td.pos.x >= frame_size.x, false))
        return;

    /* The quantiser scale is built from the first two bytes of the tile's
     * data, with the two components swapped before pack16(). */
    uint64_t pkt_offset = uint64_t(pkt_data) + td.offset;
    u8vec2buf hdr_data = u8vec2buf(pkt_offset);
    int qscale = pack16(hdr_data[0].v.yx);

    /* Origin of this component inside the tile */
    const ivec2 offs = td.pos + ivec2(COMP_ID & 1, COMP_ID >> 1);

    /* We have to do non-uniform access, so copy it */
    uint8_t qmat_buf[64] = qmat;

    /* Load, dequantise and pre-scale the coefficients */
    [[unroll]]
    for (uint i = gl_LocalInvocationID.x; i < 64; i += gl_WorkGroupSize.x) {
        int v = int(imageLoad(dst, offs + 2*ivec2(BLOCK_ID*8, 0) + scan[i])[0]);
        float vf = float(sign_extend(v, 16)) / 32768.0;
        vf *= qmat_buf[i] * qscale;
        blocks[BLOCK_ID][COMP_ID*64 + i] = (vf / (64*4.56)) *
                                           idct_scale[i];
    }

    /* Stride-1 pass: each invocation transforms entries ROW_ID*8 .. ROW_ID*8+7 */
    barrier();
    idct8(BLOCK_ID, COMP_ID*64 + ROW_ID*8, 1);

    /* All stride-1 passes must have finished before anything below runs.
     * Entry ROW_ID lies in the first line, which is written by the
     * ROW_ID == 0 invocation, so it may only be touched after this barrier. */
    barrier();

    /* Entry ROW_ID is input 0 of this invocation's stride-8 pass. That input
     * reaches all eight outputs of idct8() unscaled, so this adds 0.5 to
     * every output sample of the line. */
    blocks[BLOCK_ID][COMP_ID*64 + ROW_ID] += 0.5;

    /* Stride-8 pass: each invocation transforms entries ROW_ID + 8*k */
    idct8(BLOCK_ID, COMP_ID*64 + ROW_ID, 8);

    barrier();

    /* Convert to 12 bits, clamp, and store in the upper bits of the sample */
    [[unroll]]
    for (uint i = gl_LocalInvocationID.x; i < 64; i += gl_WorkGroupSize.x) {
        int v = int(round(blocks[BLOCK_ID][COMP_ID*64 + i]*4095.0));
        v = clamp(v, 0, 4095);
        v <<= 4;
        imageStore(dst,
                   offs + 2*ivec2(BLOCK_ID*8 + (i & 7), i >> 3),
                   ivec4(v));
    }
}