/*
 * Copyright (c) 2020 Yaroslav Pogrebnyak <yyyaroslav@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * Overlay one video on top of another using CUDA hardware acceleration
 */

#include <limits.h>
#include <math.h>

#include "libavutil/log.h"
#include "libavutil/opt.h"
#include "libavutil/pixdesc.h"
#include "libavutil/hwcontext.h"
#include "libavutil/hwcontext_cuda_internal.h"
#include "libavutil/cuda_check.h"
#include "libavutil/eval.h"

/* NOTE(review): this listing appears to have dropped some libavfilter
 * includes (the headers declaring AVFilterContext, ff_filter_frame and
 * ff_inlink_make_frame_writable); restore them from upstream. */
#include "framesync.h"

#include "cuda/load_helper.h"

/* Route a CUDA driver call through FF_CUDA_CHECK_DL. Expects a local `ctx`
 * whose `hwctx` is valid to be in scope at the call site. */
#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, ctx->hwctx->internal->cuda_dl, x)
/* Integer division of a by b, rounded up (positive operands). */
#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )

/* Thread-block dimensions used for the kernel launch.
 * NOTE(review): these two defines were missing from this listing; 32x16 is
 * restored from upstream -- confirm against the kernel. */
#define BLOCK_X 32
#define BLOCK_Y 16

/* Input pad indices: inputs[0] is the main video, inputs[1] the overlay. */
#define MAIN    0
#define OVERLAY 1

50 static const enum AVPixelFormat supported_main_formats[] = {
56 static const enum AVPixelFormat supported_overlay_formats[] = {
/* Indices into OverlayCUDAContext.var_values; the order must stay in sync
 * with the var_names[] table used to parse the x/y expressions. */
enum var_name {
    VAR_MAIN_W,    VAR_MW,
    VAR_MAIN_H,    VAR_MH,
    VAR_OVERLAY_W, VAR_OW,
    VAR_OVERLAY_H, VAR_OH,
    VAR_X,
    VAR_Y,
    VAR_N,
#if FF_API_FRAME_PKT
    VAR_POS,
#endif
    VAR_T,
    VAR_VARS_NB
};

/* When the x/y expressions are evaluated (values of the "eval" option).
 * NOTE(review): this enum was absent from the listing; restored from its
 * uses in the option table -- confirm the numeric order against upstream. */
enum EvalMode {
    EVAL_MODE_INIT,
    EVAL_MODE_FRAME,
    EVAL_MODE_NB
};

/* Names of the variables usable in the x/y expressions, in enum var_name
 * order; NULL-terminated as passed to av_expr_parse(). */
static const char *const var_names[] = {
    "main_w",    "W", ///< width  of the main    video
    "main_h",    "H", ///< height of the main    video
    "overlay_w", "w", ///< width  of the overlay video
    "overlay_h", "h", ///< height of the overlay video
    "x",
    "y",
    "n",            ///< number of frame
#if FF_API_FRAME_PKT
    "pos",          ///< position in the file
#endif
    "t",            ///< timestamp expressed in seconds
    NULL
};

102 typedef struct OverlayCUDAContext {
103 const AVClass *class;
105 enum AVPixelFormat in_format_overlay;
106 enum AVPixelFormat in_format_main;
108 AVBufferRef *hw_device_ctx;
109 AVCUDADeviceContext *hwctx;
122 double var_values[VAR_VARS_NB];
123 char *x_expr, *y_expr;
125 AVExpr *x_pexpr, *y_pexpr;
126 } OverlayCUDAContext;
129 * Helper to find out if provided format is supported by filter
131 static int format_is_supported(const enum AVPixelFormat formats[], enum AVPixelFormat fmt)
133 for (int i = 0; formats[i] != AV_PIX_FMT_NONE; i++)
134 if (formats[i] == fmt)
/**
 * Convert an evaluated coordinate to an integer aligned down to a multiple
 * of (1 << chroma_sub).
 *
 * @param d          evaluated expression value
 * @param chroma_sub log2 of the alignment; 0 means no alignment
 * @return aligned integer coordinate, or INT_MAX when d is NaN
 */
static inline int normalize_xy(double d, int chroma_sub)
{
    /* Converting NaN to int is undefined behaviour; map it to INT_MAX. */
    if (isnan(d))
        return INT_MAX;
    return (int)d & ~((1 << chroma_sub) - 1);
}

146 static void eval_expr(AVFilterContext *ctx)
148 OverlayCUDAContext *s = ctx->priv;
150 s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values, NULL);
151 s->var_values[VAR_Y] = av_expr_eval(s->y_pexpr, s->var_values, NULL);
152 /* necessary if x is expressed from y */
153 s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values, NULL);
155 s->x_position = normalize_xy(s->var_values[VAR_X], 1);
157 /* the cuda pixel format is using hwaccel, normalizing y is unnecessary */
158 s->y_position = s->var_values[VAR_Y];
161 static int set_expr(AVExpr **pexpr, const char *expr, const char *option, void *log_ctx)
168 ret = av_expr_parse(pexpr, expr, var_names,
169 NULL, NULL, NULL, NULL, 0, log_ctx);
171 av_log(log_ctx, AV_LOG_ERROR,
172 "Error when evaluating the expression '%s' for %s\n",
183 * Helper checks if we can process main and overlay pixel formats
185 static int formats_match(const enum AVPixelFormat format_main, const enum AVPixelFormat format_overlay) {
186 switch(format_main) {
187 case AV_PIX_FMT_NV12:
188 return format_overlay == AV_PIX_FMT_NV12;
189 case AV_PIX_FMT_YUV420P:
190 return format_overlay == AV_PIX_FMT_YUV420P ||
191 format_overlay == AV_PIX_FMT_YUVA420P;
198 * Call overlay kernell for a plane
200 static int overlay_cuda_call_kernel(
201 OverlayCUDAContext *ctx,
202 int x_position, int y_position,
203 uint8_t* main_data, int main_linesize,
204 int main_width, int main_height,
205 uint8_t* overlay_data, int overlay_linesize,
206 int overlay_width, int overlay_height,
207 uint8_t* alpha_data, int alpha_linesize,
208 int alpha_adj_x, int alpha_adj_y) {
210 CudaFunctions *cu = ctx->hwctx->internal->cuda_dl;
212 void* kernel_args[] = {
213 &x_position, &y_position,
214 &main_data, &main_linesize,
215 &overlay_data, &overlay_linesize,
216 &overlay_width, &overlay_height,
217 &alpha_data, &alpha_linesize,
218 &alpha_adj_x, &alpha_adj_y,
221 return CHECK_CU(cu->cuLaunchKernel(
223 DIV_UP(main_width, BLOCK_X), DIV_UP(main_height, BLOCK_Y), 1,
225 0, ctx->cu_stream, kernel_args, NULL));
229 * Perform blend overlay picture over main picture
231 static int overlay_cuda_blend(FFFrameSync *fs)
235 AVFilterContext *avctx = fs->parent;
236 OverlayCUDAContext *ctx = avctx->priv;
237 AVFilterLink *outlink = avctx->outputs[0];
238 AVFilterLink *inlink = avctx->inputs[0];
240 CudaFunctions *cu = ctx->hwctx->internal->cuda_dl;
241 CUcontext dummy, cuda_ctx = ctx->hwctx->cuda_ctx;
243 AVFrame *input_main, *input_overlay;
245 ctx->cu_ctx = cuda_ctx;
247 // read main and overlay frames from inputs
248 ret = ff_framesync_dualinput_get(fs, &input_main, &input_overlay);
256 return ff_filter_frame(outlink, input_main);
258 ret = ff_inlink_make_frame_writable(inlink, &input_main);
260 av_frame_free(&input_main);
266 ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
268 av_frame_free(&input_main);
272 if (ctx->eval_mode == EVAL_MODE_FRAME) {
273 ctx->var_values[VAR_N] = inlink->frame_count_out;
274 ctx->var_values[VAR_T] = input_main->pts == AV_NOPTS_VALUE ?
275 NAN : input_main->pts * av_q2d(inlink->time_base);
278 FF_DISABLE_DEPRECATION_WARNINGS
280 int64_t pos = input_main->pkt_pos;
281 ctx->var_values[VAR_POS] = pos == -1 ? NAN : pos;
283 FF_ENABLE_DEPRECATION_WARNINGS
286 ctx->var_values[VAR_OVERLAY_W] = ctx->var_values[VAR_OW] = input_overlay->width;
287 ctx->var_values[VAR_OVERLAY_H] = ctx->var_values[VAR_OH] = input_overlay->height;
288 ctx->var_values[VAR_MAIN_W ] = ctx->var_values[VAR_MW] = input_main->width;
289 ctx->var_values[VAR_MAIN_H ] = ctx->var_values[VAR_MH] = input_main->height;
293 av_log(avctx, AV_LOG_DEBUG, "n:%f t:%f x:%f xi:%d y:%f yi:%d\n",
294 ctx->var_values[VAR_N], ctx->var_values[VAR_T],
295 ctx->var_values[VAR_X], ctx->x_position,
296 ctx->var_values[VAR_Y], ctx->y_position);
299 // overlay first plane
301 overlay_cuda_call_kernel(ctx,
302 ctx->x_position, ctx->y_position,
303 input_main->data[0], input_main->linesize[0],
304 input_main->width, input_main->height,
305 input_overlay->data[0], input_overlay->linesize[0],
306 input_overlay->width, input_overlay->height,
307 input_overlay->data[3], input_overlay->linesize[3], 1, 1);
309 // overlay rest planes depending on pixel format
311 switch(ctx->in_format_overlay) {
312 case AV_PIX_FMT_NV12:
313 overlay_cuda_call_kernel(ctx,
314 ctx->x_position, ctx->y_position / 2,
315 input_main->data[1], input_main->linesize[1],
316 input_main->width, input_main->height / 2,
317 input_overlay->data[1], input_overlay->linesize[1],
318 input_overlay->width, input_overlay->height / 2,
321 case AV_PIX_FMT_YUV420P:
322 case AV_PIX_FMT_YUVA420P:
323 overlay_cuda_call_kernel(ctx,
324 ctx->x_position / 2 , ctx->y_position / 2,
325 input_main->data[1], input_main->linesize[1],
326 input_main->width / 2, input_main->height / 2,
327 input_overlay->data[1], input_overlay->linesize[1],
328 input_overlay->width / 2, input_overlay->height / 2,
329 input_overlay->data[3], input_overlay->linesize[3], 2, 2);
331 overlay_cuda_call_kernel(ctx,
332 ctx->x_position / 2 , ctx->y_position / 2,
333 input_main->data[2], input_main->linesize[2],
334 input_main->width / 2, input_main->height / 2,
335 input_overlay->data[2], input_overlay->linesize[2],
336 input_overlay->width / 2, input_overlay->height / 2,
337 input_overlay->data[3], input_overlay->linesize[3], 2, 2);
340 av_log(ctx, AV_LOG_ERROR, "Passed unsupported overlay pixel format\n");
341 av_frame_free(&input_main);
342 CHECK_CU(cu->cuCtxPopCurrent(&dummy));
346 CHECK_CU(cu->cuCtxPopCurrent(&dummy));
348 return ff_filter_frame(outlink, input_main);
351 static int config_input_overlay(AVFilterLink *inlink)
353 AVFilterContext *ctx = inlink->dst;
354 OverlayCUDAContext *s = inlink->dst->priv;
358 /* Finish the configuration by evaluating the expressions
359 now when both inputs are configured. */
360 s->var_values[VAR_MAIN_W ] = s->var_values[VAR_MW] = ctx->inputs[MAIN ]->w;
361 s->var_values[VAR_MAIN_H ] = s->var_values[VAR_MH] = ctx->inputs[MAIN ]->h;
362 s->var_values[VAR_OVERLAY_W] = s->var_values[VAR_OW] = ctx->inputs[OVERLAY]->w;
363 s->var_values[VAR_OVERLAY_H] = s->var_values[VAR_OH] = ctx->inputs[OVERLAY]->h;
364 s->var_values[VAR_X] = NAN;
365 s->var_values[VAR_Y] = NAN;
366 s->var_values[VAR_N] = 0;
367 s->var_values[VAR_T] = NAN;
369 s->var_values[VAR_POS] = NAN;
372 if ((ret = set_expr(&s->x_pexpr, s->x_expr, "x", ctx)) < 0 ||
373 (ret = set_expr(&s->y_pexpr, s->y_expr, "y", ctx)) < 0)
376 if (s->eval_mode == EVAL_MODE_INIT) {
378 av_log(ctx, AV_LOG_VERBOSE, "x:%f xi:%d y:%f yi:%d\n",
379 s->var_values[VAR_X], s->x_position,
380 s->var_values[VAR_Y], s->y_position);
387 * Initialize overlay_cuda
389 static av_cold int overlay_cuda_init(AVFilterContext *avctx)
391 OverlayCUDAContext* ctx = avctx->priv;
392 ctx->fs.on_event = &overlay_cuda_blend;
398 * Uninitialize overlay_cuda
400 static av_cold void overlay_cuda_uninit(AVFilterContext *avctx)
402 OverlayCUDAContext* ctx = avctx->priv;
404 ff_framesync_uninit(&ctx->fs);
406 if (ctx->hwctx && ctx->cu_module) {
408 CudaFunctions *cu = ctx->hwctx->internal->cuda_dl;
409 CHECK_CU(cu->cuCtxPushCurrent(ctx->cu_ctx));
410 CHECK_CU(cu->cuModuleUnload(ctx->cu_module));
411 CHECK_CU(cu->cuCtxPopCurrent(&dummy));
414 av_expr_free(ctx->x_pexpr); ctx->x_pexpr = NULL;
415 av_expr_free(ctx->y_pexpr); ctx->y_pexpr = NULL;
416 av_buffer_unref(&ctx->hw_device_ctx);
421 * Activate overlay_cuda
423 static int overlay_cuda_activate(AVFilterContext *avctx)
425 OverlayCUDAContext *ctx = avctx->priv;
427 return ff_framesync_activate(&ctx->fs);
433 static int overlay_cuda_config_output(AVFilterLink *outlink)
435 extern const unsigned char ff_vf_overlay_cuda_ptx_data[];
436 extern const unsigned int ff_vf_overlay_cuda_ptx_len;
439 AVFilterContext* avctx = outlink->src;
440 OverlayCUDAContext* ctx = avctx->priv;
442 AVFilterLink *inlink = avctx->inputs[0];
443 AVHWFramesContext *frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data;
445 AVFilterLink *inlink_overlay = avctx->inputs[1];
446 AVHWFramesContext *frames_ctx_overlay = (AVHWFramesContext*)inlink_overlay->hw_frames_ctx->data;
448 CUcontext dummy, cuda_ctx;
451 // check main input formats
454 av_log(ctx, AV_LOG_ERROR, "No hw context provided on main input\n");
455 return AVERROR(EINVAL);
458 ctx->in_format_main = frames_ctx->sw_format;
459 if (!format_is_supported(supported_main_formats, ctx->in_format_main)) {
460 av_log(ctx, AV_LOG_ERROR, "Unsupported main input format: %s\n",
461 av_get_pix_fmt_name(ctx->in_format_main));
462 return AVERROR(ENOSYS);
465 // check overlay input formats
467 if (!frames_ctx_overlay) {
468 av_log(ctx, AV_LOG_ERROR, "No hw context provided on overlay input\n");
469 return AVERROR(EINVAL);
472 ctx->in_format_overlay = frames_ctx_overlay->sw_format;
473 if (!format_is_supported(supported_overlay_formats, ctx->in_format_overlay)) {
474 av_log(ctx, AV_LOG_ERROR, "Unsupported overlay input format: %s\n",
475 av_get_pix_fmt_name(ctx->in_format_overlay));
476 return AVERROR(ENOSYS);
479 // check we can overlay pictures with those pixel formats
481 if (!formats_match(ctx->in_format_main, ctx->in_format_overlay)) {
482 av_log(ctx, AV_LOG_ERROR, "Can't overlay %s on %s \n",
483 av_get_pix_fmt_name(ctx->in_format_overlay), av_get_pix_fmt_name(ctx->in_format_main));
484 return AVERROR(EINVAL);
489 ctx->hw_device_ctx = av_buffer_ref(frames_ctx->device_ref);
490 if (!ctx->hw_device_ctx)
491 return AVERROR(ENOMEM);
492 ctx->hwctx = ((AVHWDeviceContext*)ctx->hw_device_ctx->data)->hwctx;
494 cuda_ctx = ctx->hwctx->cuda_ctx;
495 ctx->fs.time_base = inlink->time_base;
497 ctx->cu_stream = ctx->hwctx->stream;
499 outlink->hw_frames_ctx = av_buffer_ref(inlink->hw_frames_ctx);
500 if (!outlink->hw_frames_ctx)
501 return AVERROR(ENOMEM);
505 cu = ctx->hwctx->internal->cuda_dl;
507 err = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
512 err = ff_cuda_load_module(ctx, ctx->hwctx, &ctx->cu_module, ff_vf_overlay_cuda_ptx_data, ff_vf_overlay_cuda_ptx_len);
514 CHECK_CU(cu->cuCtxPopCurrent(&dummy));
518 err = CHECK_CU(cu->cuModuleGetFunction(&ctx->cu_func, ctx->cu_module, "Overlay_Cuda"));
520 CHECK_CU(cu->cuCtxPopCurrent(&dummy));
524 CHECK_CU(cu->cuCtxPopCurrent(&dummy));
528 err = ff_framesync_init_dualinput(&ctx->fs, avctx);
533 return ff_framesync_configure(&ctx->fs);
/* Byte offset of an option field inside the private context. */
#define OFFSET(x) offsetof(OverlayCUDAContext, x)
/* Flags shared by every option of this filter. */
#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)

540 static const AVOption overlay_cuda_options[] = {
541 { "x", "set the x expression of overlay", OFFSET(x_expr), AV_OPT_TYPE_STRING, { .str = "0" }, 0, 0, FLAGS },
542 { "y", "set the y expression of overlay", OFFSET(y_expr), AV_OPT_TYPE_STRING, { .str = "0" }, 0, 0, FLAGS },
543 { "eof_action", "Action to take when encountering EOF from secondary input ",
544 OFFSET(fs.opt_eof_action), AV_OPT_TYPE_INT, { .i64 = EOF_ACTION_REPEAT },
545 EOF_ACTION_REPEAT, EOF_ACTION_PASS, .flags = FLAGS, "eof_action" },
546 { "repeat", "Repeat the previous frame.", 0, AV_OPT_TYPE_CONST, { .i64 = EOF_ACTION_REPEAT }, .flags = FLAGS, "eof_action" },
547 { "endall", "End both streams.", 0, AV_OPT_TYPE_CONST, { .i64 = EOF_ACTION_ENDALL }, .flags = FLAGS, "eof_action" },
548 { "pass", "Pass through the main input.", 0, AV_OPT_TYPE_CONST, { .i64 = EOF_ACTION_PASS }, .flags = FLAGS, "eof_action" },
549 { "eval", "specify when to evaluate expressions", OFFSET(eval_mode), AV_OPT_TYPE_INT, { .i64 = EVAL_MODE_FRAME }, 0, EVAL_MODE_NB - 1, FLAGS, "eval" },
550 { "init", "eval expressions once during initialization", 0, AV_OPT_TYPE_CONST, { .i64=EVAL_MODE_INIT }, .flags = FLAGS, .unit = "eval" },
551 { "frame", "eval expressions per-frame", 0, AV_OPT_TYPE_CONST, { .i64=EVAL_MODE_FRAME }, .flags = FLAGS, .unit = "eval" },
552 { "shortest", "force termination when the shortest input terminates", OFFSET(fs.opt_shortest), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, FLAGS },
553 { "repeatlast", "repeat overlay of the last overlay frame", OFFSET(fs.opt_repeatlast), AV_OPT_TYPE_BOOL, {.i64=1}, 0, 1, FLAGS },
/* Defines overlay_cuda_class and overlay_cuda_framesync_preinit (both are
 * referenced by the filter definition) for the embedded framesync `fs`. */
FRAMESYNC_DEFINE_CLASS(overlay_cuda, OverlayCUDAContext, fs);

559 static const AVFilterPad overlay_cuda_inputs[] = {
562 .type = AVMEDIA_TYPE_VIDEO,
566 .type = AVMEDIA_TYPE_VIDEO,
567 .config_props = config_input_overlay,
571 static const AVFilterPad overlay_cuda_outputs[] = {
574 .type = AVMEDIA_TYPE_VIDEO,
575 .config_props = &overlay_cuda_config_output,
579 const AVFilter ff_vf_overlay_cuda = {
580 .name = "overlay_cuda",
581 .description = NULL_IF_CONFIG_SMALL("Overlay one video on top of another using CUDA"),
582 .priv_size = sizeof(OverlayCUDAContext),
583 .priv_class = &overlay_cuda_class,
584 .init = &overlay_cuda_init,
585 .uninit = &overlay_cuda_uninit,
586 .activate = &overlay_cuda_activate,
587 FILTER_INPUTS(overlay_cuda_inputs),
588 FILTER_OUTPUTS(overlay_cuda_outputs),
589 FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA),
590 .preinit = overlay_cuda_framesync_preinit,
591 .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,