From 805b1ee9b33c0dadc22e22fc9a7592f784a3f8d7 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Sebastian=20Dr=C3=B6ge?=
Date: Tue, 4 Aug 2020 12:53:18 +0300
Subject: [PATCH] video-converter: Add fast paths from v210 to I420/YV12, Y42B,
 UYVY and YUY2

These then don't require going through the generic code path via AYUV64
first but can be converted directly.

This speeds up processing of
  videotestsrc ! v210 ! videoconvert ! other_format ! fakesink
by a factor of 1.55 for I420/YV12 and 1.40 for the other destination
formats and reduces memory pressure considerably.

Part-of:
---
 gst-libs/gst/video/video-converter.c | 515 +++++++++++++++++++++++++++++++++++
 1 file changed, 515 insertions(+)

diff --git a/gst-libs/gst/video/video-converter.c b/gst-libs/gst/video/video-converter.c
index c321396..2ee9e81 100644
--- a/gst-libs/gst/video/video-converter.c
+++ b/gst-libs/gst/video/video-converter.c
@@ -3597,6 +3597,165 @@ convert_YUY2_I420 (GstVideoConverter * convert, const GstVideoFrame * src,
   }
 }
 
+static void
+convert_v210_I420_task (FConvertTask * task)
+{                               /* Worker: converts source line pairs in [height_0, height_1) from v210 words into 8-bit Y, U and V planes. */
+  gint i, j;
+  gint l1, l2;
+  guint8 *d_y1, *d_y2, *d_u, *d_v;
+  const guint8 *s1, *s2;
+  guint32 a0, a1, a2, a3;
+  guint16 y0_1, y1_1, y2_1, y3_1, y4_1, y5_1;   /* the _1 suffix marks samples of line l1 */
+  guint16 u0_1, u2_1, u4_1;
+  guint16 v0_1, v2_1, v4_1;
+  guint16 y0_2, y1_2, y2_2, y3_2, y4_2, y5_2;   /* the _2 suffix marks samples of line l2 */
+  guint16 u0_2, u2_2, u4_2;
+  guint16 v0_2, v2_2, v4_2;
+
+  for (i = task->height_0; i < task->height_1; i += 2) {        /* two lines per iteration */
+    GET_LINE_OFFSETS (task->interlaced, i, l1, l2);     /* yields the two line indices l1/l2 used below */
+
+    d_y1 = FRAME_GET_Y_LINE (task->dest, l1);
+    d_y2 = FRAME_GET_Y_LINE (task->dest, l2);
+    d_u = FRAME_GET_U_LINE (task->dest, i >> 1);        /* one chroma line per pair of luma lines */
+    d_v = FRAME_GET_V_LINE (task->dest, i >> 1);
+
+    s1 = FRAME_GET_LINE (task->src, l1);
+    s2 = FRAME_GET_LINE (task->src, l2);
+
+    for (j = 0; j < task->width; j += 6) {      /* each 16-byte group of four 32-bit LE words carries 6 pixels */
+      a0 = GST_READ_UINT32_LE (s1 + (j / 6) * 16 + 0);
+      a1 = GST_READ_UINT32_LE (s1 + (j / 6) * 16 + 4);
+      a2 = GST_READ_UINT32_LE (s1 + (j / 6) * 16 + 8);
+      a3 = GST_READ_UINT32_LE (s1 + (j / 6) * 16 + 12);
+
+      u0_1 = ((a0 >> 0) & 0x3ff) >> 2;  /* extract a 10-bit field and keep its top 8 bits */
+      y0_1 = ((a0 >> 10) & 0x3ff) >> 2;
+      v0_1 = ((a0 >> 20) & 0x3ff) >> 2;
+      y1_1 = ((a1 >> 0) & 0x3ff) >> 2;
+
+      u2_1 = ((a1 >> 10) & 0x3ff) >> 2;
+      y2_1 = ((a1 >> 20) & 0x3ff) >> 2;
+      v2_1 = ((a2 >> 0) & 0x3ff) >> 2;
+      y3_1 = ((a2 >> 10) & 0x3ff) >> 2;
+
+      u4_1 = ((a2 >> 20) & 0x3ff) >> 2;
+      y4_1 = ((a3 >> 0) & 0x3ff) >> 2;
+      v4_1 = ((a3 >> 10) & 0x3ff) >> 2;
+      y5_1 = ((a3 >> 20) & 0x3ff) >> 2;
+
+      a0 = GST_READ_UINT32_LE (s2 + (j / 6) * 16 + 0);  /* same group, second line */
+      a1 = GST_READ_UINT32_LE (s2 + (j / 6) * 16 + 4);
+      a2 = GST_READ_UINT32_LE (s2 + (j / 6) * 16 + 8);
+      a3 = GST_READ_UINT32_LE (s2 + (j / 6) * 16 + 12);
+
+      u0_2 = ((a0 >> 0) & 0x3ff) >> 2;
+      y0_2 = ((a0 >> 10) & 0x3ff) >> 2;
+      v0_2 = ((a0 >> 20) & 0x3ff) >> 2;
+      y1_2 = ((a1 >> 0) & 0x3ff) >> 2;
+
+      u2_2 = ((a1 >> 10) & 0x3ff) >> 2;
+      y2_2 = ((a1 >> 20) & 0x3ff) >> 2;
+      v2_2 = ((a2 >> 0) & 0x3ff) >> 2;
+      y3_2 = ((a2 >> 10) & 0x3ff) >> 2;
+
+      u4_2 = ((a2 >> 20) & 0x3ff) >> 2;
+      y4_2 = ((a3 >> 0) & 0x3ff) >> 2;
+      v4_2 = ((a3 >> 10) & 0x3ff) >> 2;
+      y5_2 = ((a3 >> 20) & 0x3ff) >> 2;
+
+      d_y1[j] = y0_1;           /* pixel j always exists because j < width */
+      d_y2[j] = y0_2;
+      d_u[j / 2] = (u0_1 + u0_2) / 2;   /* chroma is the truncated mean of both lines */
+      d_v[j / 2] = (v0_1 + v0_2) / 2;
+
+      if (j < task->width - 1) {        /* the guards below skip pixels past width in the last group */
+        d_y1[j + 1] = y1_1;
+        d_y2[j + 1] = y1_2;
+      }
+
+      if (j < task->width - 2) {
+        d_y1[j + 2] = y2_1;
+        d_y2[j + 2] = y2_2;
+        d_u[j / 2 + 1] = (u2_1 + u2_2) / 2;
+        d_v[j / 2 + 1] = (v2_1 + v2_2) / 2;
+      }
+
+      if (j < task->width - 3) {
+        d_y1[j + 3] = y3_1;
+        d_y2[j + 3] = y3_2;
+      }
+
+      if (j < task->width - 4) {
+        d_y1[j + 4] = y4_1;
+        d_y2[j + 4] = y4_2;
+        d_u[j / 2 + 2] = (u4_1 + u4_2) / 2;
+        d_v[j / 2 + 2] = (v4_1 + v4_2) / 2;
+      }
+
+      if (j < task->width - 5) {
+        d_y1[j + 5] = y5_1;
+        d_y2[j + 5] = y5_2;
+      }
+    }
+  }
+}
+
+static void
+convert_v210_I420 (GstVideoConverter * convert, const GstVideoFrame * src,
+    GstVideoFrame * dest)
+{                               /* Entry point: splits the first h2 lines across the runner's threads, then converts leftover lines via UNPACK_FRAME/PACK_FRAME. */
+  int i;
+  gint width = convert->in_width;
+  gint height = convert->in_height;
+  gboolean interlaced = GST_VIDEO_FRAME_IS_INTERLACED (src);
+  gint h2;
+  FConvertTask *tasks;
+  FConvertTask **tasks_p;
+  gint n_threads;
+  gint lines_per_thread;
+
+  /* I420 has half as many chroma lines, as such we have to
+   * always merge two into one. For non-interlaced these are
+   * the two next to each other, for interlaced one is skipped
+   * in between. */
+  if (interlaced)
+    h2 = GST_ROUND_DOWN_4 (height);
+  else
+    h2 = GST_ROUND_DOWN_2 (height);
+
+  n_threads = convert->conversion_runner->n_threads;
+  tasks = g_newa (FConvertTask, n_threads);     /* per-call task array; no matching free appears in this function */
+  tasks_p = g_newa (FConvertTask *, n_threads);
+
+  lines_per_thread = GST_ROUND_UP_2 ((h2 + n_threads - 1) / n_threads); /* ceiling share, rounded up to even so each task starts on an even line */
+
+  for (i = 0; i < n_threads; i++) {
+    tasks[i].src = src;
+    tasks[i].dest = dest;
+
+    tasks[i].interlaced = interlaced;
+    tasks[i].width = width;
+
+    tasks[i].height_0 = i * lines_per_thread;
+    tasks[i].height_1 = tasks[i].height_0 + lines_per_thread;
+    tasks[i].height_1 = MIN (h2, tasks[i].height_1);    /* clamp to h2; a task with height_0 >= height_1 does no work */
+
+    tasks_p[i] = &tasks[i];
+  }
+
+  gst_parallelized_task_runner_run (convert->conversion_runner,
+      (GstParallelizedTaskFunc) convert_v210_I420_task, (gpointer) tasks_p);
+
+  /* now handle last lines. For interlaced these are up to 3 */
+  if (h2 != height) {
+    for (i = h2; i < height; i++) {
+      UNPACK_FRAME (src, convert->tmpline[0], i, convert->in_x, width);
+      PACK_FRAME (dest, convert->tmpline[0], i, width);
+    }
+  }
+}
+
 typedef struct
 {
   const guint8 *s, *s2, *su, *sv;
@@ -3786,6 +3945,128 @@ convert_YUY2_Y444 (GstVideoConverter * convert, const GstVideoFrame * src,
 }
 
 static void
+convert_v210_Y42B_task (FConvertPlaneTask * task)
+{                               /* Worker: converts task->height lines of v210 words into separate 8-bit Y, U and V planes. */
+  gint i, j;
+  guint8 *d_y, *d_u, *d_v;
+  const guint8 *s;
+  guint32 a0, a1, a2, a3;
+  guint16 y0, y1, y2, y3, y4, y5;
+  guint16 u0, u2, u4;
+  guint16 v0, v2, v4;
+
+  for (i = 0; i < task->height; i++) {
+    d_y = task->d + i * task->dstride;
+    d_u = task->du + i * task->dustride;
+    d_v = task->dv + i * task->dvstride;
+    s = task->s + i * task->sstride;
+
+    for (j = 0; j < task->width; j += 6) {      /* each 16-byte group of four 32-bit LE words carries 6 pixels */
+      a0 = GST_READ_UINT32_LE (s + (j / 6) * 16 + 0);
+      a1 = GST_READ_UINT32_LE (s + (j / 6) * 16 + 4);
+      a2 = GST_READ_UINT32_LE (s + (j / 6) * 16 + 8);
+      a3 = GST_READ_UINT32_LE (s + (j / 6) * 16 + 12);
+
+      u0 = ((a0 >> 0) & 0x3ff) >> 2;    /* extract a 10-bit field and keep its top 8 bits */
+      y0 = ((a0 >> 10) & 0x3ff) >> 2;
+      v0 = ((a0 >> 20) & 0x3ff) >> 2;
+      y1 = ((a1 >> 0) & 0x3ff) >> 2;
+
+      u2 = ((a1 >> 10) & 0x3ff) >> 2;
+      y2 = ((a1 >> 20) & 0x3ff) >> 2;
+      v2 = ((a2 >> 0) & 0x3ff) >> 2;
+      y3 = ((a2 >> 10) & 0x3ff) >> 2;
+
+      u4 = ((a2 >> 20) & 0x3ff) >> 2;
+      y4 = ((a3 >> 0) & 0x3ff) >> 2;
+      v4 = ((a3 >> 10) & 0x3ff) >> 2;
+      y5 = ((a3 >> 20) & 0x3ff) >> 2;
+
+      d_y[j] = y0;
+      d_u[j / 2] = u0;          /* one U and one V sample per two luma samples */
+      d_v[j / 2] = v0;
+
+      if (j < task->width - 1) {        /* the guards below skip pixels past width in the last group */
+        d_y[j + 1] = y1;
+      }
+
+      if (j < task->width - 2) {
+        d_y[j + 2] = y2;
+        d_u[j / 2 + 1] = u2;
+        d_v[j / 2 + 1] = v2;
+      }
+
+      if (j < task->width - 3) {
+        d_y[j + 3] = y3;
+      }
+
+      if (j < task->width - 4) {
+        d_y[j + 4] = y4;
+        d_u[j / 2 + 2] = u4;
+        d_v[j / 2 + 2] = v4;
+      }
+
+      if (j < task->width - 5) {
+        d_y[j + 5] = y5;
+      }
+    }
+  }
+}
+
+static void
+convert_v210_Y42B (GstVideoConverter * convert, const GstVideoFrame * src,
+    GstVideoFrame * dest)
+{                               /* Entry point: splits the lines into per-thread bands, runs the Y42B worker on each, then calls convert_fill_border. */
+  gint width = convert->in_width;
+  gint height = convert->in_height;
+  guint8 *s, *dy, *du, *dv;
+  FConvertPlaneTask *tasks;
+  FConvertPlaneTask **tasks_p;
+  gint n_threads;
+  gint lines_per_thread;
+  gint i;
+
+  s = FRAME_GET_LINE (src, convert->in_y);
+  s += (GST_ROUND_UP_2 (convert->in_x) * 2);    /* NOTE(review): offset assumes 2 bytes per pixel, but the worker reads 16 bytes per 6 pixels; only exact for in_x == 0 - confirm callers never crop */
+
+  dy = FRAME_GET_Y_LINE (dest, convert->out_y);
+  dy += convert->out_x;
+  du = FRAME_GET_U_LINE (dest, convert->out_y);
+  du += convert->out_x >> 1;    /* chroma planes advance half as far as luma horizontally */
+  dv = FRAME_GET_V_LINE (dest, convert->out_y);
+  dv += convert->out_x >> 1;
+
+  n_threads = convert->conversion_runner->n_threads;
+  tasks = g_newa (FConvertPlaneTask, n_threads);        /* per-call task array; no matching free appears in this function */
+  tasks_p = g_newa (FConvertPlaneTask *, n_threads);
+
+  lines_per_thread = (height + n_threads - 1) / n_threads;      /* ceiling division */
+
+  for (i = 0; i < n_threads; i++) {
+    tasks[i].dstride = FRAME_GET_Y_STRIDE (dest);
+    tasks[i].dustride = FRAME_GET_U_STRIDE (dest);
+    tasks[i].dvstride = FRAME_GET_V_STRIDE (dest);
+    tasks[i].sstride = FRAME_GET_STRIDE (src);
+    tasks[i].d = dy + i * lines_per_thread * tasks[i].dstride;  /* each task starts i * lines_per_thread lines down */
+    tasks[i].du = du + i * lines_per_thread * tasks[i].dustride;
+    tasks[i].dv = dv + i * lines_per_thread * tasks[i].dvstride;
+    tasks[i].s = s + i * lines_per_thread * tasks[i].sstride;
+
+    tasks[i].width = width;
+    tasks[i].height = (i + 1) * lines_per_thread;       /* band end line ... */
+    tasks[i].height = MIN (tasks[i].height, height);    /* ... clamped to the frame ... */
+    tasks[i].height -= i * lines_per_thread;    /* ... minus band start; a result <= 0 makes the worker loop a no-op */
+
+    tasks_p[i] = &tasks[i];
+  }
+
+  gst_parallelized_task_runner_run (convert->conversion_runner,
+      (GstParallelizedTaskFunc) convert_v210_Y42B_task, (gpointer) tasks_p);
+
+  convert_fill_border (convert, dest);
+}
+
+static void
 convert_UYVY_I420_task (FConvertTask * task)
 {
   gint i;
@@ -3963,6 +4244,228 @@ convert_UYVY_YUY2 (GstVideoConverter * convert, const GstVideoFrame * src,
 }
 
 static void
+convert_v210_UYVY_task (FConvertPlaneTask * task)
+{                               /* Worker: converts task->height lines of v210 words into 8-bit packed output in U, Y, V, Y byte order. */
+  gint i, j;
+  guint8 *d;
+  const guint8 *s;
+  guint32 a0, a1, a2, a3;
+  guint16 y0, y1, y2, y3, y4, y5;
+  guint16 u0, u2, u4;
+  guint16 v0, v2, v4;
+
+  for (i = 0; i < task->height; i++) {
+    d = task->d + i * task->dstride;
+    s = task->s + i * task->sstride;
+
+    for (j = 0; j < task->width; j += 6) {      /* each 16-byte group of four 32-bit LE words carries 6 pixels */
+      a0 = GST_READ_UINT32_LE (s + (j / 6) * 16 + 0);
+      a1 = GST_READ_UINT32_LE (s + (j / 6) * 16 + 4);
+      a2 = GST_READ_UINT32_LE (s + (j / 6) * 16 + 8);
+      a3 = GST_READ_UINT32_LE (s + (j / 6) * 16 + 12);
+
+      u0 = ((a0 >> 0) & 0x3ff) >> 2;    /* extract a 10-bit field and keep its top 8 bits */
+      y0 = ((a0 >> 10) & 0x3ff) >> 2;
+      v0 = ((a0 >> 20) & 0x3ff) >> 2;
+      y1 = ((a1 >> 0) & 0x3ff) >> 2;
+
+      u2 = ((a1 >> 10) & 0x3ff) >> 2;
+      y2 = ((a1 >> 20) & 0x3ff) >> 2;
+      v2 = ((a2 >> 0) & 0x3ff) >> 2;
+      y3 = ((a2 >> 10) & 0x3ff) >> 2;
+
+      u4 = ((a2 >> 20) & 0x3ff) >> 2;
+      y4 = ((a3 >> 0) & 0x3ff) >> 2;
+      v4 = ((a3 >> 10) & 0x3ff) >> 2;
+      y5 = ((a3 >> 20) & 0x3ff) >> 2;
+
+      d[2 * j + 1] = y0;        /* output uses 2 bytes per pixel: U at 2j, Y at 2j+1, V at 2j+2, next Y at 2j+3 */
+      d[2 * j] = u0;
+      d[2 * j + 2] = v0;        /* V of the first pair is written even when pixel j+1 is past width */
+
+      if (j < task->width - 1) {        /* the guards below skip pixels past width in the last group */
+        d[2 * j + 3] = y1;
+      }
+
+      if (j < task->width - 2) {
+        d[2 * j + 5] = y2;
+        d[2 * j + 4] = u2;
+        d[2 * j + 6] = v2;
+      }
+
+      if (j < task->width - 3) {
+        d[2 * j + 7] = y3;
+      }
+
+      if (j < task->width - 4) {
+        d[2 * j + 9] = y4;
+        d[2 * j + 8] = u4;
+        d[2 * j + 10] = v4;
+      }
+
+      if (j < task->width - 5) {
+        d[2 * j + 11] = y5;
+      }
+    }
+  }
+}
+
+static void
+convert_v210_UYVY (GstVideoConverter * convert, const GstVideoFrame * src,
+    GstVideoFrame * dest)
+{                               /* Entry point: splits the lines into per-thread bands, runs the UYVY worker on each, then calls convert_fill_border. */
+  gint width = convert->in_width;
+  gint height = convert->in_height;
+  guint8 *s, *d;
+  FConvertPlaneTask *tasks;
+  FConvertPlaneTask **tasks_p;
+  gint n_threads;
+  gint lines_per_thread;
+  gint i;
+
+  s = FRAME_GET_LINE (src, convert->in_y);
+  s += (GST_ROUND_UP_2 (convert->in_x) * 2);    /* NOTE(review): offset assumes 2 bytes per pixel, but the worker reads 16 bytes per 6 pixels; only exact for in_x == 0 - confirm callers never crop */
+  d = FRAME_GET_LINE (dest, convert->out_y);
+  d += (GST_ROUND_UP_2 (convert->out_x) * 2);   /* 2 output bytes per pixel, start rounded up to an even pixel */
+
+  n_threads = convert->conversion_runner->n_threads;
+  tasks = g_newa (FConvertPlaneTask, n_threads);        /* per-call task array; no matching free appears in this function */
+  tasks_p = g_newa (FConvertPlaneTask *, n_threads);
+
+  lines_per_thread = (height + n_threads - 1) / n_threads;      /* ceiling division */
+
+  for (i = 0; i < n_threads; i++) {
+    tasks[i].dstride = FRAME_GET_STRIDE (dest);
+    tasks[i].sstride = FRAME_GET_STRIDE (src);
+    tasks[i].d = d + i * lines_per_thread * tasks[i].dstride;   /* each task starts i * lines_per_thread lines down */
+    tasks[i].s = s + i * lines_per_thread * tasks[i].sstride;
+
+    tasks[i].width = width;
+    tasks[i].height = (i + 1) * lines_per_thread;       /* band end line ... */
+    tasks[i].height = MIN (tasks[i].height, height);    /* ... clamped to the frame ... */
+    tasks[i].height -= i * lines_per_thread;    /* ... minus band start; a result <= 0 makes the worker loop a no-op */
+
+    tasks_p[i] = &tasks[i];
+  }
+
+  gst_parallelized_task_runner_run (convert->conversion_runner,
+      (GstParallelizedTaskFunc) convert_v210_UYVY_task, (gpointer) tasks_p);
+
+  convert_fill_border (convert, dest);
+}
+
+static void
+convert_v210_YUY2_task (FConvertPlaneTask * task)
+{                               /* Worker: converts task->height lines of v210 words into 8-bit packed output in Y, U, Y, V byte order. */
+  gint i, j;
+  guint8 *d;
+  const guint8 *s;
+  guint32 a0, a1, a2, a3;
+  guint16 y0, y1, y2, y3, y4, y5;
+  guint16 u0, u2, u4;
+  guint16 v0, v2, v4;
+
+  for (i = 0; i < task->height; i++) {
+    d = task->d + i * task->dstride;
+    s = task->s + i * task->sstride;
+
+    for (j = 0; j < task->width; j += 6) {      /* each 16-byte group of four 32-bit LE words carries 6 pixels */
+      a0 = GST_READ_UINT32_LE (s + (j / 6) * 16 + 0);
+      a1 = GST_READ_UINT32_LE (s + (j / 6) * 16 + 4);
+      a2 = GST_READ_UINT32_LE (s + (j / 6) * 16 + 8);
+      a3 = GST_READ_UINT32_LE (s + (j / 6) * 16 + 12);
+
+      u0 = ((a0 >> 0) & 0x3ff) >> 2;    /* extract a 10-bit field and keep its top 8 bits */
+      y0 = ((a0 >> 10) & 0x3ff) >> 2;
+      v0 = ((a0 >> 20) & 0x3ff) >> 2;
+      y1 = ((a1 >> 0) & 0x3ff) >> 2;
+
+      u2 = ((a1 >> 10) & 0x3ff) >> 2;
+      y2 = ((a1 >> 20) & 0x3ff) >> 2;
+      v2 = ((a2 >> 0) & 0x3ff) >> 2;
+      y3 = ((a2 >> 10) & 0x3ff) >> 2;
+
+      u4 = ((a2 >> 20) & 0x3ff) >> 2;
+      y4 = ((a3 >> 0) & 0x3ff) >> 2;
+      v4 = ((a3 >> 10) & 0x3ff) >> 2;
+      y5 = ((a3 >> 20) & 0x3ff) >> 2;
+
+      d[2 * j] = y0;            /* output uses 2 bytes per pixel: Y at 2j, U at 2j+1, next Y at 2j+2, V at 2j+3 */
+      d[2 * j + 1] = u0;
+      d[2 * j + 3] = v0;        /* V of the first pair is written even when pixel j+1 is past width */
+
+      if (j < task->width - 1) {        /* the guards below skip pixels past width in the last group */
+        d[2 * j + 2] = y1;
+      }
+
+      if (j < task->width - 2) {
+        d[2 * j + 4] = y2;
+        d[2 * j + 5] = u2;
+        d[2 * j + 7] = v2;
+      }
+
+      if (j < task->width - 3) {
+        d[2 * j + 6] = y3;
+      }
+
+      if (j < task->width - 4) {
+        d[2 * j + 8] = y4;
+        d[2 * j + 9] = u4;
+        d[2 * j + 11] = v4;
+      }
+
+      if (j < task->width - 5) {
+        d[2 * j + 10] = y5;
+      }
+    }
+  }
+}
+
+static void
+convert_v210_YUY2 (GstVideoConverter * convert, const GstVideoFrame * src,
+    GstVideoFrame * dest)
+{                               /* Entry point: splits the lines into per-thread bands, runs the YUY2 worker on each, then calls convert_fill_border. */
+  gint width = convert->in_width;
+  gint height = convert->in_height;
+  guint8 *s, *d;
+  FConvertPlaneTask *tasks;
+  FConvertPlaneTask **tasks_p;
+  gint n_threads;
+  gint lines_per_thread;
+  gint i;
+
+  s = FRAME_GET_LINE (src, convert->in_y);
+  s += (GST_ROUND_UP_2 (convert->in_x) * 2);    /* NOTE(review): offset assumes 2 bytes per pixel, but the worker reads 16 bytes per 6 pixels; only exact for in_x == 0 - confirm callers never crop */
+  d = FRAME_GET_LINE (dest, convert->out_y);
+  d += (GST_ROUND_UP_2 (convert->out_x) * 2);   /* 2 output bytes per pixel, start rounded up to an even pixel */
+
+  n_threads = convert->conversion_runner->n_threads;
+  tasks = g_newa (FConvertPlaneTask, n_threads);        /* per-call task array; no matching free appears in this function */
+  tasks_p = g_newa (FConvertPlaneTask *, n_threads);
+
+  lines_per_thread = (height + n_threads - 1) / n_threads;      /* ceiling division */
+
+  for (i = 0; i < n_threads; i++) {
+    tasks[i].dstride = FRAME_GET_STRIDE (dest);
+    tasks[i].sstride = FRAME_GET_STRIDE (src);
+    tasks[i].d = d + i * lines_per_thread * tasks[i].dstride;   /* each task starts i * lines_per_thread lines down */
+    tasks[i].s = s + i * lines_per_thread * tasks[i].sstride;
+
+    tasks[i].width = width;
+    tasks[i].height = (i + 1) * lines_per_thread;       /* band end line ... */
+    tasks[i].height = MIN (tasks[i].height, height);    /* ... clamped to the frame ... */
+    tasks[i].height -= i * lines_per_thread;    /* ... minus band start; a result <= 0 makes the worker loop a no-op */
+
+    tasks_p[i] = &tasks[i];
+  }
+
+  gst_parallelized_task_runner_run (convert->conversion_runner,
+      (GstParallelizedTaskFunc) convert_v210_YUY2_task, (gpointer) tasks_p);
+
+  convert_fill_border (convert, dest);
+}
+
+static void
 convert_UYVY_Y42B_task (FConvertPlaneTask * task)
 {
   video_orc_convert_UYVY_Y42B (task->d, task->dstride, task->du,
@@ -6372,6 +6875,11 @@ static const VideoTransform transforms[] = {
   {GST_VIDEO_FORMAT_AYUV, GST_VIDEO_FORMAT_UYVY, TRUE, FALSE, TRUE, TRUE,
       TRUE, FALSE, FALSE, FALSE, 1, 0, convert_AYUV_UYVY},
 
+  {GST_VIDEO_FORMAT_v210, GST_VIDEO_FORMAT_UYVY, TRUE, FALSE, TRUE, FALSE,
+      TRUE, FALSE, FALSE, FALSE, 0, 0, convert_v210_UYVY},      /* registers the v210 -> UYVY fast path */
+  {GST_VIDEO_FORMAT_v210, GST_VIDEO_FORMAT_YUY2, TRUE, FALSE, TRUE, FALSE,
+      TRUE, FALSE, FALSE, FALSE, 0, 0, convert_v210_YUY2},      /* registers the v210 -> YUY2 fast path */
+
   /* packed -> planar */
   {GST_VIDEO_FORMAT_YUY2, GST_VIDEO_FORMAT_I420, TRUE, FALSE, TRUE, FALSE,
       FALSE, FALSE, FALSE, FALSE, 0, 0, convert_YUY2_I420},
@@ -6402,6 +6910,13 @@ static const VideoTransform transforms[] = {
   {GST_VIDEO_FORMAT_AYUV, GST_VIDEO_FORMAT_Y444, TRUE, FALSE, TRUE, TRUE,
       TRUE, FALSE, FALSE, FALSE, 0, 0, convert_AYUV_Y444},
 
+  {GST_VIDEO_FORMAT_v210, GST_VIDEO_FORMAT_I420, TRUE, FALSE, TRUE, FALSE,
+      FALSE, FALSE, FALSE, FALSE, 0, 0, convert_v210_I420},     /* registers the v210 -> I420 fast path */
+  {GST_VIDEO_FORMAT_v210, GST_VIDEO_FORMAT_YV12, TRUE, FALSE, TRUE, FALSE,
+      FALSE, FALSE, FALSE, FALSE, 0, 0, convert_v210_I420},     /* YV12 reuses the I420 converter function */
+  {GST_VIDEO_FORMAT_v210, GST_VIDEO_FORMAT_Y42B, TRUE, FALSE, TRUE, FALSE,
+      FALSE, FALSE, FALSE, FALSE, 0, 0, convert_v210_Y42B},     /* registers the v210 -> Y42B fast path */
+
   /* planar -> planar */
   {GST_VIDEO_FORMAT_I420, GST_VIDEO_FORMAT_I420, TRUE, FALSE, FALSE, TRUE,
       TRUE, FALSE, FALSE, FALSE, 0, 0, convert_scale_planes},
-- 
2.7.4