mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2026-02-13 17:05:43 +01:00
It allows us to easily synchronize the software and hardware decoders by removing the abstraction that the Vulkan layer had added by changing the values written.
175 lines
6.5 KiB
Plaintext
175 lines
6.5 KiB
Plaintext
/*
|
|
* ProRes RAW decoder
|
|
*
|
|
* Copyright (c) 2025 Lynne <dev@lynne.ee>
|
|
*
|
|
* This file is part of FFmpeg.
|
|
*
|
|
* FFmpeg is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* FFmpeg is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with FFmpeg; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
*/
|
|
|
|
/*
 * Shorthand names for the three local invocation coordinates.
 *
 * COMP_ID  (z): selects one of the four 64-entry sections of a shared block
 *               and the (COMP_ID & 1, COMP_ID >> 1) pixel offset in main().
 * BLOCK_ID (y): first index into the shared blocks[][] array, and the
 *               horizontal block position inside the tile.
 * ROW_ID   (x): which line of eight samples this invocation passes to idct8().
 */
#define COMP_ID  (gl_LocalInvocationID.z)
#define BLOCK_ID (gl_LocalInvocationID.y)
#define ROW_ID   (gl_LocalInvocationID.x)
|
|
|
|
/*
 * Workgroup-shared scratch space for the transform: indexed first by
 * BLOCK_ID, then by COMP_ID*64 + coefficient index (four sections of 64
 * floats per block).
 */
shared float blocks[16][4 * 64];
|
|
|
|
/*
 * Image-space offset for each of the 64 coefficient slots of a block.
 * main() adds scan[i] to the block origin when loading the value that is
 * stored at index i of the shared block. Entries are listed eight per line,
 * i.e. one line of the table per group of eight consecutive indices.
 */
const ivec2 scan[64] = {
    /*  0.. 7 */ ivec2( 0,  0), ivec2( 4,  0), ivec2( 0,  2), ivec2( 4,  2), ivec2( 0,  8), ivec2( 4,  8), ivec2( 6,  8), ivec2( 2, 10),
    /*  8..15 */ ivec2( 2,  0), ivec2( 6,  0), ivec2( 2,  2), ivec2( 6,  2), ivec2( 2,  8), ivec2( 8,  8), ivec2( 0, 10), ivec2( 4, 10),
    /* 16..23 */ ivec2( 8,  0), ivec2(12,  0), ivec2( 8,  2), ivec2(12,  2), ivec2(10,  8), ivec2(14,  8), ivec2( 6, 10), ivec2( 2, 12),
    /* 24..31 */ ivec2(10,  0), ivec2(14,  0), ivec2(10,  2), ivec2(14,  2), ivec2(12,  8), ivec2( 8, 10), ivec2( 0, 12), ivec2( 4, 12),
    /* 32..39 */ ivec2( 0,  4), ivec2( 4,  4), ivec2( 6,  4), ivec2( 2,  6), ivec2(10, 10), ivec2(14, 10), ivec2( 6, 12), ivec2( 2, 14),
    /* 40..47 */ ivec2( 2,  4), ivec2( 8,  4), ivec2( 0,  6), ivec2( 4,  6), ivec2(12, 10), ivec2( 8, 12), ivec2( 0, 14), ivec2( 4, 14),
    /* 48..55 */ ivec2(10,  4), ivec2(14,  4), ivec2( 6,  6), ivec2(12,  6), ivec2(10, 12), ivec2(14, 12), ivec2( 6, 14), ivec2(12, 14),
    /* 56..63 */ ivec2(12,  4), ivec2( 8,  6), ivec2(10,  6), ivec2(14,  6), ivec2(12, 12), ivec2( 8, 14), ivec2(10, 14), ivec2(14, 14),
};
|
|
|
|
/*
 * Per-coefficient scale factors that main() multiplies into each
 * dequantised coefficient before the two idct8() passes. Indexed by the
 * same 0..63 index as the shared block; grouped below in rows of eight.
 */
const float idct_scale[64] = {
    /* row 0 */
    0.1250000000000000, 0.1733799806652684, 0.1633203706095471, 0.1469844503024199,
    0.1250000000000000, 0.0982118697983878, 0.0676495125182746, 0.0344874224103679,
    /* row 1 */
    0.1733799806652684, 0.2404849415639108, 0.2265318615882219, 0.2038732892122293,
    0.1733799806652684, 0.1362237766939547, 0.0938325693794663, 0.0478354290456362,
    /* row 2 */
    0.1633203706095471, 0.2265318615882219, 0.2133883476483184, 0.1920444391778541,
    0.1633203706095471, 0.1283199917898342, 0.0883883476483185, 0.0450599888754343,
    /* row 3 */
    0.1469844503024199, 0.2038732892122293, 0.1920444391778541, 0.1728354290456362,
    0.1469844503024199, 0.1154849415639109, 0.0795474112858021, 0.0405529186026822,
    /* row 4 */
    0.1250000000000000, 0.1733799806652684, 0.1633203706095471, 0.1469844503024199,
    0.1250000000000000, 0.0982118697983878, 0.0676495125182746, 0.0344874224103679,
    /* row 5 */
    0.0982118697983878, 0.1362237766939547, 0.1283199917898342, 0.1154849415639109,
    0.0982118697983878, 0.0771645709543638, 0.0531518809229535, 0.0270965939155924,
    /* row 6 */
    0.0676495125182746, 0.0938325693794663, 0.0883883476483185, 0.0795474112858021,
    0.0676495125182746, 0.0531518809229535, 0.0366116523516816, 0.0186644585125857,
    /* row 7 */
    0.0344874224103679, 0.0478354290456362, 0.0450599888754343, 0.0405529186026822,
    0.0344874224103679, 0.0270965939155924, 0.0186644585125857, 0.0095150584360892,
};
|
|
|
|
/*
 * In-place 8-point inverse transform of one line of a shared block.
 *
 * block  - first index into blocks[][]
 * offset - index of the first sample of the line
 * stride - distance between consecutive samples of the line
 *
 * Reads the eight values blocks[block][k*stride + offset], k = 0..7, and
 * writes the eight results back to the same locations. The function does no
 * synchronisation of its own; callers must order accesses between
 * invocations themselves.
 */
void idct8(uint block, uint offset, uint stride)
{
    const float SQRT2   = 1.4142135623730950488016887242097f;
    const float C_1_847 = 1.8477590650225735122563663787936f;
    const float C_1_082 = 1.0823922002923939687994464107328f;
    const float C_2_613 = 2.6131259297527530557132863468544f;

    /* Input */
    const float in0 = blocks[block][0*stride + offset];
    const float in1 = blocks[block][1*stride + offset];
    const float in2 = blocks[block][2*stride + offset];
    const float in3 = blocks[block][3*stride + offset];
    const float in4 = blocks[block][4*stride + offset];
    const float in5 = blocks[block][5*stride + offset];
    const float in6 = blocks[block][6*stride + offset];
    const float in7 = blocks[block][7*stride + offset];

    /* Even inputs: embedded scaled inverse 4-point Type-II DCT */
    const float sum04 = in0 + in4;
    const float dif04 = in0 - in4;
    const float sum26 = in2 + in6;
    const float rot26 = (in2 - in6)*SQRT2 - sum26;

    const float e0 = sum04 + sum26;
    const float e1 = dif04 + rot26;
    const float e2 = dif04 - rot26;
    const float e3 = sum04 - sum26;

    /* Odd inputs: embedded scaled inverse 4-point Type-IV DST */
    const float sum53 = in5 + in3;
    const float dif53 = in5 - in3;
    const float sum17 = in1 + in7;
    const float dif17 = in1 - in7;

    const float shared_rot = (dif17 + dif53)*C_1_847;

    /* Each odd term is built from the previous one, so keep this order */
    const float o7 = sum17 + sum53;
    const float o6 = o7 - (shared_rot - dif53*C_2_613);
    const float o5 = o6 + (sum17 - sum53)*SQRT2;
    const float o4 = o5 - (shared_rot - dif17*C_1_082);

    /* Final butterflies, written straight to the output slots */
    blocks[block][0*stride + offset] = e0 + o7;
    blocks[block][1*stride + offset] = e1 - o6;
    blocks[block][2*stride + offset] = e2 + o5;
    blocks[block][3*stride + offset] = e3 - o4;
    blocks[block][4*stride + offset] = e3 + o4;
    blocks[block][5*stride + offset] = e2 - o5;
    blocks[block][6*stride + offset] = e1 + o6;
    blocks[block][7*stride + offset] = e0 - o7;
}
|
|
|
|
/*
 * Entry point: dequantise and inverse-transform one 8x8 block of one
 * component, in place in the `dst` image.
 *
 * Each workgroup handles one tile (tile_data[tile_idx]). Within it,
 * BLOCK_ID picks the block, COMP_ID the component and ROW_ID the line of
 * eight samples this invocation transforms in each idct8() pass.
 */
void main(void)
{
    /* Depends only on workgroup-level values, so every invocation of a
     * workgroup loads the same tile descriptor. */
    const uint tile_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x;
    TileData td = tile_data[tile_idx];

    /* Tile starts past the right edge of the frame: nothing to do. This
     * happens before any barrier() and is taken by the whole workgroup. */
    if (expectEXT(td.pos.x >= frame_size.x, false))
        return;

    /* The scale is packed from the first two bytes of the tile's data,
     * with the byte order swapped (.yx). */
    uint64_t pkt_offset = uint64_t(pkt_data) + td.offset;
    u8vec2buf hdr_data = u8vec2buf(pkt_offset);
    int qscale = pack16(hdr_data[0].v.yx);

    /* Per-component origin: the four components sit at the four positions
     * of a 2x2 pixel cell, hence the factor 2 on all block coordinates. */
    const ivec2 offs = td.pos + ivec2(COMP_ID & 1, COMP_ID >> 1);

    /* We have to do non-uniform access, so copy it */
    uint8_t qmat_buf[64] = qmat;

    /* Load, dequantise and pre-scale the coefficients into shared memory */
    [[unroll]]
    for (uint i = gl_LocalInvocationID.x; i < 64; i += gl_WorkGroupSize.x) {
        int v = int(imageLoad(dst, offs + 2*ivec2(BLOCK_ID*8, 0) + scan[i])[0]);
        float vf = float(sign_extend(v, 16)) / 32768.0;
        vf *= qmat_buf[i] * qscale;
        blocks[BLOCK_ID][COMP_ID*64 + i] = (vf / (64*4.56)) *
                                           idct_scale[i];
    }

    /* First pass: eight contiguous samples starting at ROW_ID*8 */
    barrier();
    idct8(BLOCK_ID, COMP_ID*64 + ROW_ID*8, 1);

    /*
     * Second pass: eight samples with stride 8 starting at ROW_ID.
     *
     * The 0.5 bias is added to the first input of this pass. That element
     * was produced by the invocation with ROW_ID == 0 during the first pass,
     * so it must only be touched after the barrier; doing it before would
     * race with that invocation's writes. After the barrier each invocation
     * only modifies the element it is about to consume itself.
     */
    barrier();
    blocks[BLOCK_ID][COMP_ID*64 + ROW_ID] += 0.5;
    idct8(BLOCK_ID, COMP_ID*64 + ROW_ID, 8);

    /* Convert to 12-bit range, clamp, and store shifted up by 4 bits */
    barrier();
    [[unroll]]
    for (uint i = gl_LocalInvocationID.x; i < 64; i += gl_WorkGroupSize.x) {
        int v = int(round(blocks[BLOCK_ID][COMP_ID*64 + i]*4095.0));
        v = clamp(v, 0, 4095);
        v <<= 4;
        imageStore(dst,
                   offs + 2*ivec2(BLOCK_ID*8 + (i & 7), i >> 3),
                   ivec4(v));
    }
}
|