Files
ffmpeg/libavcodec/vulkan/prores_idct.comp
averne 1c5bb1b12d vulkan/prores: normalize coefficients during IDCT
This allows increased internal precision.
In addition, we can introduce an offset to the DC coefficient
during the second IDCT step, to remove a per-element addition
in the output codepath.
Finally, by processing columns first we can remove the barrier
after loading coefficients.

Signed-off-by: averne <averne381@gmail.com>
2025-11-29 17:56:28 +01:00

177 lines
6.3 KiB
Plaintext

/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
 * Shared workspace for the IDCT: 4*2 blocks (two macroblocks), each holding
 * 8 rows of 8 coefficients. Rows are stored with a stride of 8+1 floats, i.e.
 * one unused padding entry per row, to avoid bank conflicts.
 */
shared float blocks[4*2][8*(8+1)];
/**
 * Read the first channel of the texel at `pos` from image `dst[tex_idx]`.
 * When INTERLACED is defined, `pos.y` is a line index within the current
 * field and is mapped to the frame line (pos.y << 1) + bottom_field.
 */
uint get_px(uint tex_idx, ivec2 pos)
{
#ifdef INTERLACED
    /* Field line -> frame line */
    ivec2 frame_pos = ivec2(pos.x, (pos.y << 1) + bottom_field);
    return imageLoad(dst[tex_idx], frame_pos).x;
#else
    return imageLoad(dst[tex_idx], pos).x;
#endif
}
/**
 * Store `v` (replicated to all four channels) at `pos` in image `dst[tex_idx]`.
 * When INTERLACED is defined, `pos.y` is a line index within the current
 * field and is mapped to the frame line (pos.y << 1) + bottom_field.
 */
void put_px(uint tex_idx, ivec2 pos, uint v)
{
    uvec4 texel = uvec4(v);
#ifdef INTERLACED
    /* Field line -> frame line */
    ivec2 frame_pos = ivec2(pos.x, (pos.y << 1) + bottom_field);
    imageStore(dst[tex_idx], frame_pos, texel);
#else
    imageStore(dst[tex_idx], pos, texel);
#endif
}
/*
 * Per-coefficient scale factors applied to the dequantized coefficients
 * before the IDCT (see the load loop in main()). The table is an 8x8 matrix
 * stored row-major and indexed as (row << 3) + column.
 *
 * The values match the separable product s[row] * s[column] with
 *   s[0] = 1 / (2 * sqrt(2)),  s[k] = cos(k * pi / 16) / 2  for k = 1..7,
 * so the matrix is symmetric.
 * NOTE(review): presumably these are the output scale factors that the
 * factorized butterflies in idct8() leave out; confirm against the derivation.
 */
const float idct_scale[64] = {
0.1250000000000000, 0.1733799806652684, 0.1633203706095471, 0.1469844503024199,
0.1250000000000000, 0.0982118697983878, 0.0676495125182746, 0.0344874224103679,
0.1733799806652684, 0.2404849415639108, 0.2265318615882219, 0.2038732892122293,
0.1733799806652684, 0.1362237766939547, 0.0938325693794663, 0.0478354290456362,
0.1633203706095471, 0.2265318615882219, 0.2133883476483184, 0.1920444391778541,
0.1633203706095471, 0.1283199917898342, 0.0883883476483185, 0.0450599888754343,
0.1469844503024199, 0.2038732892122293, 0.1920444391778541, 0.1728354290456362,
0.1469844503024199, 0.1154849415639109, 0.0795474112858021, 0.0405529186026822,
0.1250000000000000, 0.1733799806652684, 0.1633203706095471, 0.1469844503024199,
0.1250000000000000, 0.0982118697983878, 0.0676495125182746, 0.0344874224103679,
0.0982118697983878, 0.1362237766939547, 0.1283199917898342, 0.1154849415639109,
0.0982118697983878, 0.0771645709543638, 0.0531518809229535, 0.0270965939155924,
0.0676495125182746, 0.0938325693794663, 0.0883883476483185, 0.0795474112858021,
0.0676495125182746, 0.0531518809229535, 0.0366116523516816, 0.0186644585125857,
0.0344874224103679, 0.0478354290456362, 0.0450599888754343, 0.0405529186026822,
0.0344874224103679, 0.0270965939155924, 0.0186644585125857, 0.0095150584360892,
};
/* 7.4 Inverse Transform */
/**
 * In-place 8-point inverse transform over eight entries of blocks[block].
 *
 * The entries processed are blocks[block][k*stride + offset] for k = 0..7;
 * the results overwrite the same eight entries. The routine applies no
 * per-coefficient scaling of its own.
 *
 * @param block   index of the 8x8 block inside the shared `blocks` array
 * @param offset  position of the first entry within the block
 * @param stride  distance between consecutive entries
 */
void idct8(uint block, uint offset, uint stride)
{
    const float k_sqrt2 = 1.4142135623730950488016887242097f;
    const float k_rot   = 1.8477590650225735122563663787936f;
    const float k_lo    = 1.0823922002923939687994464107328f;
    const float k_hi    = 2.6131259297527530557132863468544f;

    /* Fetch the eight inputs */
    float x0 = blocks[block][0*stride + offset];
    float x1 = blocks[block][1*stride + offset];
    float x2 = blocks[block][2*stride + offset];
    float x3 = blocks[block][3*stride + offset];
    float x4 = blocks[block][4*stride + offset];
    float x5 = blocks[block][5*stride + offset];
    float x6 = blocks[block][6*stride + offset];
    float x7 = blocks[block][7*stride + offset];

    /* Even half (inputs 0, 2, 4, 6): embedded scaled inverse 4-point Type-II DCT */
    float sum04  = x0 + x4;
    float diff04 = x0 - x4;
    float sum26  = x2 + x6;
    float rot26  = (x2 - x6)*(k_sqrt2) - sum26;

    float even0 = sum04 + sum26;
    float even3 = sum04 - sum26;
    float even1 = diff04 + rot26;
    float even2 = diff04 - rot26;

    /* Odd half (inputs 1, 3, 5, 7): embedded scaled inverse 4-point Type-IV DST */
    float sum53  = x5 + x3;
    float diff53 = x5 - x3;
    float sum17  = x1 + x7;
    float diff17 = x1 - x7;

    float odd7  = sum17 + sum53;
    float mid5  = (sum17 - sum53)*(k_sqrt2);
    float pivot = (diff17 + diff53)*(k_rot);
    float mid4  = pivot - diff17*(k_lo);
    float mid6  = pivot - diff53*(k_hi);

    /* Each odd term is built from the previous one */
    float odd6 = odd7 - mid6;
    float odd5 = odd6 + mid5;
    float odd4 = odd5 - mid4;

    /* Final butterflies, written straight back to shared memory */
    blocks[block][0*stride + offset] = even0 + odd7;
    blocks[block][1*stride + offset] = even1 - odd6;
    blocks[block][2*stride + offset] = even2 + odd5;
    blocks[block][3*stride + offset] = even3 - odd4;
    blocks[block][4*stride + offset] = even3 + odd4;
    blocks[block][5*stride + offset] = even2 - odd5;
    blocks[block][6*stride + offset] = even1 + odd6;
    blocks[block][7*stride + offset] = even0 - odd7;
}
/**
 * Shader entry point.
 *
 * Each invocation handles one column of one 8x8 block: it reads eight
 * coefficients from the image, dequantizes and pre-scales them into shared
 * memory, takes part in a column-wise and then a row-wise idct8() pass, and
 * finally writes eight rescaled, clamped samples back to the same image
 * positions.
 */
void main(void)
{
uvec3 gid = gl_GlobalInvocationID, lid = gl_LocalInvocationID;
/*
 * comp:  image/component index (gid.z)
 * block: slot in blocks[] used by this invocation
 * idx:   column (0..7) inside that block
 */
uint comp = gid.z, block = (lid.y << 2) | (lid.x >> 3), idx = lid.x & 0x7;
/* Horizontal subsampling only applies to components other than 0 */
uint chroma_shift = comp != 0 ? log2_chroma_w : 0;
/*
 * Invocations past the right edge of the component do no image loads or
 * stores, but still execute both idct8() calls and both barriers below.
 */
bool act = gid.x < mb_width << (4 - chroma_shift);
/**
 * Normalize coefficients to [-1, 1] for increased precision during the iDCT.
 * DCT coeffs have the range of a 12-bit signed integer (7.4 Inverse Transform).
 */
const float norm = 1.0f / (1 << 11);
/* Coalesced load of DCT coeffs in shared memory, inverse quantization */
if (act) {
/**
 * According to the VK spec indexing an array in push constant memory with
 * a non-dynamically uniform value is illegal ($15.9.1 in v1.4.326),
 * so copy the whole matrix locally.
 */
uint8_t[64] qmat = comp == 0 ? qmat_luma : qmat_chroma;
/* Table 15 */
/* One quantization index per macroblock: row gid.y >> 1, column gid.x >> (4 - chroma_shift) */
uint8_t qidx = quant_idx[(gid.y >> 1) * mb_width + (gid.x >> (4 - chroma_shift))];
int qscale = qidx > 128 ? (qidx - 96) << 2 : qidx;
[[unroll]] for (uint i = 0; i < 8; ++i) {
/* Row i, column idx of the 8x8 block (row-major index into qmat / idct_scale) */
uint cidx = (i << 3) + idx;
/* Stored coefficients are 16-bit two's complement values */
int c = sign_extend(int(get_px(comp, ivec2(gid.x, (gid.y << 3) + i))), 16);
float v = float(c * qscale * int(qmat[cidx])) * norm;
/* Shared-memory rows have a stride of 9 floats (8 values + 1 padding entry) */
blocks[block][i * 9 + idx] = v * idct_scale[cidx];
}
}
/*
 * Column-wise iDCT. This pass only touches the entries i*9 + idx that this
 * same invocation wrote above, so no barrier is needed before it.
 */
/* Column-wise iDCT */
idct8(block, idx, 9);
/* The row pass reads entries written by other invocations: synchronize first */
barrier();
/* Remap [-1, 1] to [0, 2] to remove a per-element addition in the output loop */
/* Adds the offset to the first entry of row idx before the row transform */
blocks[block][idx * 9] += 1.0f;
/* Row-wise iDCT */
idct8(block, idx * 9, 1);
/* The output loop reads column idx, i.e. one entry from each row: synchronize again */
barrier();
/* Scale from the [0, 2] range to [0, 2^depth]; maxv is the largest valid sample value */
float fact = 1 << (depth - 1);
int maxv = (1 << depth) - 1;
/* 7.5.1 Color Component Samples. Rescale, clamp and write back to global memory */
if (act) {
[[unroll]] for (uint i = 0; i < 8; ++i) {
float v = round(blocks[block][i * 9 + idx] * fact);
put_px(comp, ivec2(gid.x, (gid.y << 3) + i), clamp(int(v), 0, maxv));
}
}
}