Files
ffmpeg/libswscale/ops_dispatch.c
Niklas Haas 7fb1e0832c swscale/ops_dispatch: move ENOTSUP error to ff_sws_compile_pass()
Or else this might false-positive when we retry compilation after subpass
splitting.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <git@haasn.dev>
2026-03-12 21:02:48 +00:00

410 lines
14 KiB
C

/**
* Copyright (C) 2025 Niklas Haas
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/avassert.h"
#include "libavutil/mem.h"
#include "libavutil/mem_internal.h"
#include "ops.h"
#include "ops_internal.h"
#include "ops_dispatch.h"
typedef struct SwsOpPass {
SwsCompiledOp comp;
SwsOpExec exec_base;
int num_blocks;
int tail_off_in;
int tail_off_out;
int tail_size_in;
int tail_size_out;
int planes_in;
int planes_out;
int pixel_bits_in;
int pixel_bits_out;
int idx_in[4];
int idx_out[4];
bool memcpy_in;
bool memcpy_out;
} SwsOpPass;
int ff_sws_ops_compile_backend(SwsContext *ctx, const SwsOpBackend *backend,
const SwsOpList *ops, SwsCompiledOp *out)
{
SwsOpList *copy;
SwsCompiledOp compiled = {0};
int ret = 0;
copy = ff_sws_op_list_duplicate(ops);
if (!copy)
return AVERROR(ENOMEM);
/* Ensure these are always set during compilation */
ff_sws_op_list_update_comps(copy);
ret = backend->compile(ctx, copy, &compiled);
if (ret < 0) {
int msg_lev = ret == AVERROR(ENOTSUP) ? AV_LOG_TRACE : AV_LOG_ERROR;
av_log(ctx, msg_lev, "Backend '%s' failed to compile operations: %s\n",
backend->name, av_err2str(ret));
} else {
*out = compiled;
}
ff_sws_op_list_free(&copy);
return ret;
}
int ff_sws_ops_compile(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out)
{
for (int n = 0; ff_sws_op_backends[n]; n++) {
const SwsOpBackend *backend = ff_sws_op_backends[n];
if (ops->src.hw_format != backend->hw_format ||
ops->dst.hw_format != backend->hw_format)
continue;
if (ff_sws_ops_compile_backend(ctx, backend, ops, out) < 0)
continue;
av_log(ctx, AV_LOG_VERBOSE, "Compiled using backend '%s': "
"block size = %d, over-read = %d, over-write = %d, cpu flags = 0x%x\n",
backend->name, out->block_size, out->over_read, out->over_write,
out->cpu_flags);
ff_sws_op_list_print(ctx, AV_LOG_VERBOSE, AV_LOG_TRACE, ops);
return 0;
}
return AVERROR(ENOTSUP);
}
void ff_sws_compiled_op_unref(SwsCompiledOp *comp)
{
if (comp->free)
comp->free(comp->priv);
*comp = (SwsCompiledOp) {0};
}
static void op_pass_free(void *ptr)
{
SwsOpPass *p = ptr;
if (!p)
return;
ff_sws_compiled_op_unref(&p->comp);
av_free(p);
}
static inline void get_row_data(const SwsOpPass *p, const int y,
const uint8_t *in[4], uint8_t *out[4])
{
const SwsOpExec *base = &p->exec_base;
for (int i = 0; i < p->planes_in; i++)
in[i] = base->in[i] + (y >> base->in_sub_y[i]) * base->in_stride[i];
for (int i = 0; i < p->planes_out; i++)
out[i] = base->out[i] + (y >> base->out_sub_y[i]) * base->out_stride[i];
}
static int op_pass_setup(const SwsFrame *out, const SwsFrame *in,
const SwsPass *pass)
{
const AVPixFmtDescriptor *indesc = av_pix_fmt_desc_get(in->format);
const AVPixFmtDescriptor *outdesc = av_pix_fmt_desc_get(out->format);
SwsOpPass *p = pass->priv;
SwsOpExec *exec = &p->exec_base;
const SwsCompiledOp *comp = &p->comp;
const int block_size = comp->block_size;
p->num_blocks = (pass->width + block_size - 1) / block_size;
/* Set up main loop parameters */
const int aligned_w = p->num_blocks * block_size;
const int safe_width = (p->num_blocks - 1) * block_size;
const int tail_size = pass->width - safe_width;
p->tail_off_in = safe_width * p->pixel_bits_in >> 3;
p->tail_off_out = safe_width * p->pixel_bits_out >> 3;
p->tail_size_in = tail_size * p->pixel_bits_in >> 3;
p->tail_size_out = tail_size * p->pixel_bits_out >> 3;
p->memcpy_in = false;
p->memcpy_out = false;
for (int i = 0; i < p->planes_in; i++) {
const int idx = p->idx_in[i];
const int chroma = idx == 1 || idx == 2;
const int sub_x = chroma ? indesc->log2_chroma_w : 0;
const int sub_y = chroma ? indesc->log2_chroma_h : 0;
const int plane_w = (aligned_w + sub_x) >> sub_x;
const int plane_pad = (comp->over_read + sub_x) >> sub_x;
const int plane_size = plane_w * p->pixel_bits_in >> 3;
if (comp->slice_align)
p->memcpy_in |= plane_size + plane_pad > in->linesize[idx];
exec->in[i] = in->data[idx];
exec->in_stride[i] = in->linesize[idx];
exec->in_sub_y[i] = sub_y;
exec->in_sub_x[i] = sub_x;
}
for (int i = 0; i < p->planes_out; i++) {
const int idx = p->idx_out[i];
const int chroma = idx == 1 || idx == 2;
const int sub_x = chroma ? outdesc->log2_chroma_w : 0;
const int sub_y = chroma ? outdesc->log2_chroma_h : 0;
const int plane_w = (aligned_w + sub_x) >> sub_x;
const int plane_pad = (comp->over_write + sub_x) >> sub_x;
const int plane_size = plane_w * p->pixel_bits_out >> 3;
if (comp->slice_align)
p->memcpy_out |= plane_size + plane_pad > out->linesize[idx];
exec->out[i] = out->data[idx];
exec->out_stride[i] = out->linesize[idx];
exec->out_sub_y[i] = sub_y;
exec->out_sub_x[i] = sub_x;
}
/* Pre-fill pointer bump for the main section only; this value does not
* matter at all for the tail / last row handlers because they only ever
* process a single line */
const int blocks_main = p->num_blocks - p->memcpy_out;
for (int i = 0; i < 4; i++) {
exec->in_bump[i] = exec->in_stride[i] - blocks_main * exec->block_size_in;
exec->out_bump[i] = exec->out_stride[i] - blocks_main * exec->block_size_out;
}
return 0;
}
/* Dispatch kernel over the last column of the image using memcpy */
static av_always_inline void
handle_tail(const SwsOpPass *p, SwsOpExec *exec,
const bool copy_out, const bool copy_in,
int y, const int h)
{
DECLARE_ALIGNED_64(uint8_t, tmp)[2][4][sizeof(uint32_t[128])];
const SwsOpExec *base = &p->exec_base;
const SwsCompiledOp *comp = &p->comp;
const int tail_size_in = p->tail_size_in;
const int tail_size_out = p->tail_size_out;
const int bx = p->num_blocks - 1;
const uint8_t *in_data[4];
uint8_t *out_data[4];
get_row_data(p, y, in_data, out_data);
for (int i = 0; i < p->planes_in; i++) {
in_data[i] += p->tail_off_in;
if (copy_in) {
exec->in[i] = (void *) tmp[0][i];
exec->in_stride[i] = sizeof(tmp[0][i]);
} else {
exec->in[i] = in_data[i];
}
}
for (int i = 0; i < p->planes_out; i++) {
out_data[i] += p->tail_off_out;
if (copy_out) {
exec->out[i] = (void *) tmp[1][i];
exec->out_stride[i] = sizeof(tmp[1][i]);
} else {
exec->out[i] = out_data[i];
}
}
for (int y_end = y + h; y < y_end; y++) {
if (copy_in) {
for (int i = 0; i < p->planes_in; i++) {
av_assert2(tmp[0][i] + tail_size_in < (uint8_t *) tmp[1]);
memcpy(tmp[0][i], in_data[i], tail_size_in);
in_data[i] += base->in_stride[i]; /* exec->in_stride was clobbered */
}
}
comp->func(exec, comp->priv, bx, y, p->num_blocks, y + 1);
if (copy_out) {
for (int i = 0; i < p->planes_out; i++) {
av_assert2(tmp[1][i] + tail_size_out < (uint8_t *) tmp[2]);
memcpy(out_data[i], tmp[1][i], tail_size_out);
out_data[i] += base->out_stride[i];
}
}
for (int i = 0; i < 4; i++) {
if (!copy_in && exec->in[i])
exec->in[i] += exec->in_stride[i];
if (!copy_out && exec->out[i])
exec->out[i] += exec->out_stride[i];
}
}
}
static void op_pass_run(const SwsFrame *out, const SwsFrame *in, const int y,
const int h, const SwsPass *pass)
{
const SwsOpPass *p = pass->priv;
const SwsCompiledOp *comp = &p->comp;
/* Fill exec metadata for this slice */
DECLARE_ALIGNED_32(SwsOpExec, exec) = p->exec_base;
exec.slice_y = y;
exec.slice_h = h;
/**
* To ensure safety, we need to consider the following:
*
* 1. We can overread the input, unless this is the last line of an
* unpadded buffer. All defined operations can handle arbitrary pixel
* input, so overread of arbitrary data is fine.
*
* 2. We can overwrite the output, as long as we don't write more than the
* amount of pixels that fit into one linesize. So we always need to
* memcpy the last column on the output side if unpadded.
*
* 3. For the last row, we also need to memcpy the remainder of the input,
* to avoid reading past the end of the buffer. Note that since we know
* the run() function is called on stripes of the same buffer, we don't
* need to worry about this for the end of a slice.
*/
const int last_slice = y + h == pass->height;
const bool memcpy_in = last_slice && p->memcpy_in;
const bool memcpy_out = p->memcpy_out;
const int num_blocks = p->num_blocks;
const int blocks_main = num_blocks - memcpy_out;
const int h_main = h - memcpy_in;
/* Handle main section */
get_row_data(p, y, exec.in, exec.out);
comp->func(&exec, comp->priv, 0, y, blocks_main, y + h_main);
if (memcpy_in) {
/* Safe part of last row */
get_row_data(p, y + h_main, exec.in, exec.out);
comp->func(&exec, comp->priv, 0, y + h_main, num_blocks - 1, y + h);
}
/* Handle last column via memcpy, takes over `exec` so call these last */
if (memcpy_out)
handle_tail(p, &exec, true, false, y, h_main);
if (memcpy_in)
handle_tail(p, &exec, memcpy_out, true, y + h_main, 1);
}
static int rw_planes(const SwsOp *op)
{
return op->rw.packed ? 1 : op->rw.elems;
}
static int rw_pixel_bits(const SwsOp *op)
{
const int elems = op->rw.packed ? op->rw.elems : 1;
const int size = ff_sws_pixel_type_size(op->type);
const int bits = 8 >> op->rw.frac;
av_assert1(bits >= 1);
return elems * size * bits;
}
static int compile(SwsGraph *graph, const SwsOpList *ops, SwsPass *input,
SwsPass **output)
{
SwsContext *ctx = graph->ctx;
SwsOpPass *p = av_mallocz(sizeof(*p));
if (!p)
return AVERROR(ENOMEM);
int ret = ff_sws_ops_compile(ctx, ops, &p->comp);
if (ret < 0)
goto fail;
const SwsFormat *dst = &ops->dst;
if (p->comp.opaque) {
SwsCompiledOp c = p->comp;
av_free(p);
return ff_sws_graph_add_pass(graph, dst->format, dst->width, dst->height,
input, c.slice_align, c.func_opaque,
NULL, c.priv, c.free, output);
}
const SwsOp *read = ff_sws_op_list_input(ops);
const SwsOp *write = ff_sws_op_list_output(ops);
p->planes_in = rw_planes(read);
p->planes_out = rw_planes(write);
p->pixel_bits_in = rw_pixel_bits(read);
p->pixel_bits_out = rw_pixel_bits(write);
p->exec_base = (SwsOpExec) {
.width = dst->width,
.height = dst->height,
.block_size_in = p->comp.block_size * p->pixel_bits_in >> 3,
.block_size_out = p->comp.block_size * p->pixel_bits_out >> 3,
};
for (int i = 0; i < 4; i++) {
p->idx_in[i] = i < p->planes_in ? ops->order_src.in[i] : -1;
p->idx_out[i] = i < p->planes_out ? ops->order_dst.in[i] : -1;
}
return ff_sws_graph_add_pass(graph, dst->format, dst->width, dst->height,
input, p->comp.slice_align, op_pass_run,
op_pass_setup, p, op_pass_free, output);
fail:
op_pass_free(p);
return ret;
}
int ff_sws_compile_pass(SwsGraph *graph, SwsOpList **pops, int flags,
SwsPass *input, SwsPass **output)
{
SwsContext *ctx = graph->ctx;
SwsOpList *ops = *pops;
int ret = 0;
/* Check if the whole operation graph is an end-to-end no-op */
if (ff_sws_op_list_is_noop(ops)) {
*output = input;
goto out;
}
const SwsOp *read = ff_sws_op_list_input(ops);
const SwsOp *write = ff_sws_op_list_output(ops);
if (!read || !write) {
av_log(ctx, AV_LOG_ERROR, "First and last operations must be a read "
"and write, respectively.\n");
ret = AVERROR(EINVAL);
goto out;
}
if (flags & SWS_OP_FLAG_OPTIMIZE) {
ret = ff_sws_op_list_optimize(ops);
if (ret < 0)
goto out;
}
ret = compile(graph, ops, input, output);
out:
if (ret == AVERROR(ENOTSUP)) {
av_log(ctx, AV_LOG_WARNING, "No backend found for operations:\n");
ff_sws_op_list_print(ctx, AV_LOG_WARNING, AV_LOG_TRACE, ops);
}
ff_sws_op_list_free(&ops);
*pops = NULL;
return ret;
}