From 56d47f641f7764f0842240f19fc5229fccfc6ad9 Mon Sep 17 00:00:00 2001
From: Seungha Yang
Date: Wed, 24 Nov 2021 20:21:52 +0900
Subject: [PATCH] compositor: Add support for all formats

For formats for which we don't have a fast-path implementation, the
compositor converts frames to a common unpack format (AYUV, ARGB,
AYUV64 or ARGB64) and blends them in that intermediate format. The
blended image is then converted back to the selected output format
if required.

Part-of: 
---
 .../gst-plugins-base/gst/compositor/blend.c      | 700 +++++++++++++++++++++
 .../gst-plugins-base/gst/compositor/blend.h      |  10 +
 .../gst-plugins-base/gst/compositor/compositor.c | 188 +++++-
 .../gst-plugins-base/gst/compositor/compositor.h |   4 +
 4 files changed, 873 insertions(+), 29 deletions(-)

diff --git a/subprojects/gst-plugins-base/gst/compositor/blend.c b/subprojects/gst-plugins-base/gst/compositor/blend.c
index 7bb33f7..a7091ae 100644
--- a/subprojects/gst-plugins-base/gst/compositor/blend.c
+++ b/subprojects/gst-plugins-base/gst/compositor/blend.c
@@ -696,6 +696,693 @@ PLANAR_YUV_HIGH_FILL_COLOR (y444_16le, LE, compositor_orc_memset_u16_2d);
 PLANAR_YUV_HIGH_FILL_CHECKER (y444_16be, 16, BE, compositor_orc_memset_u16_2d);
 PLANAR_YUV_HIGH_FILL_COLOR (y444_16be, BE, compositor_orc_memset_u16_2d);
 
+/* TODO: port to ORC */
+#if G_BYTE_ORDER == G_LITTLE_ENDIAN
+static void
+compositor_blend_argb64 (guint8 * ORC_RESTRICT d1, int d1_stride,
+    const guint8 * ORC_RESTRICT s1, int s1_stride, int p1, int n, int m)
+{
+  gint i, j, k;
+  const guint64 comp_mask_0 = 0xffff000000000000;
+  const guint64 comp_mask_1 = 0x0000ffff00000000;
+  const guint64 comp_mask_2 = 0x00000000ffff0000;
+  const guint64 comp_mask_alpha = 0x000000000000ffff;
+
+  for (j = 0; j < m; j++) {
+    guint64 *dst;
+    guint64 *src;
+
+    dst = (guint64 *) (d1 + (d1_stride * j));
+    src = (guint64 *) (s1 + (s1_stride * j));
+
+    for (i = 0; i < n; i++) {
+      guint64 src_val = src[i];
+      guint64 dst_val = dst[i];
+      guint64 src_comp[3];
+      guint64 dst_comp[3];
+      guint64 src_alpha;
+      guint64 src_alpha_inv;
+
+      src_comp[0] = (src_val & comp_mask_0) >> 48;
+      src_comp[1] = (src_val & comp_mask_1) >> 32;
+      src_comp[2] = (src_val & comp_mask_2) >> 16;
+
+      dst_comp[0] = (dst_val & comp_mask_0) >> 48;
+      dst_comp[1] = (dst_val & comp_mask_1) >> 32;
+      dst_comp[2] = (dst_val & comp_mask_2) >> 16;
+
+      src_alpha = src_val & comp_mask_alpha;
+      src_alpha *= p1;
+      src_alpha /= G_MAXUINT16;
+      src_alpha = CLAMP (src_alpha, 0, G_MAXUINT16);
+      src_alpha_inv = G_MAXUINT16 - src_alpha;
+
+      for (k = 0; k < G_N_ELEMENTS (src_comp); k++) {
+        src_comp[k] *= src_alpha;
+        dst_comp[k] *= src_alpha_inv;
+        dst_comp[k] += src_comp[k];
+        dst_comp[k] /= G_MAXUINT16;
+
+        dst_comp[k] = CLAMP (dst_comp[k], 0, G_MAXUINT16);
+      }
+
+      dst_val = (dst_comp[0] << 48) | (dst_comp[1] << 32) | (dst_comp[2] << 16)
+          | comp_mask_alpha;
+      dst[i] = dst_val;
+    }
+  }
+}
+
+static void
+compositor_source_argb64 (guint8 * ORC_RESTRICT d1, int d1_stride,
+    const guint8 * ORC_RESTRICT s1, int s1_stride, int p1, int n, int m)
+{
+  gint i, j;
+  const guint64 comp_mask_non_alpha = 0xffffffffffff0000;
+  const guint64 comp_mask_alpha = 0x000000000000ffff;
+
+  for (j = 0; j < m; j++) {
+    guint64 *dst;
+    guint64 *src;
+
+    dst = (guint64 *) (d1 + (d1_stride * j));
+    src = (guint64 *) (s1 + (s1_stride * j));
+
+    for (i = 0; i < n; i++) {
+      guint64 src_val = src[i];
+      guint64 dst_val;
+      guint64 src_alpha;
+
+      src_alpha = src_val & comp_mask_alpha;
+      src_alpha *= p1;
+      src_alpha /= G_MAXUINT16;
+      src_alpha = CLAMP (src_alpha, 0, 
G_MAXUINT16); + + dst_val = (src_val & comp_mask_non_alpha) | src_alpha; + dst[i] = dst_val; + } + } +} + +static void +compositor_overlay_argb64 (guint8 * ORC_RESTRICT d1, int d1_stride, + const guint8 * ORC_RESTRICT s1, int s1_stride, int p1, int n, int m) +{ + gint i, j, k; + const guint64 comp_mask_0 = 0xffff000000000000; + const guint64 comp_mask_1 = 0x0000ffff00000000; + const guint64 comp_mask_2 = 0x00000000ffff0000; + const guint64 comp_mask_alpha = 0x000000000000ffff; + + for (j = 0; j < m; j++) { + guint64 *dst; + guint64 *src; + + dst = (guint64 *) (d1 + (d1_stride * j)); + src = (guint64 *) (s1 + (s1_stride * j)); + + for (i = 0; i < n; i++) { + guint64 src_val = src[i]; + guint64 dst_val = dst[i]; + guint64 src_comp[3]; + guint64 dst_comp[3]; + guint64 src_alpha; + guint64 src_alpha_inv; + guint64 dst_alpha; + + src_comp[0] = (src_val & comp_mask_0) >> 48; + src_comp[1] = (src_val & comp_mask_1) >> 32; + src_comp[2] = (src_val & comp_mask_2) >> 16; + + dst_comp[0] = (dst_val & comp_mask_0) >> 48; + dst_comp[1] = (dst_val & comp_mask_1) >> 32; + dst_comp[2] = (dst_val & comp_mask_2) >> 16; + + /* calc source alpha as alpha_s = alpha_s * alpha / 255 */ + src_alpha = src_val & comp_mask_alpha; + src_alpha *= p1; + src_alpha /= G_MAXUINT16; + src_alpha = CLAMP (src_alpha, 0, G_MAXUINT16); + src_alpha_inv = G_MAXUINT16 - src_alpha; + + for (k = 0; k < G_N_ELEMENTS (src_comp); k++) + src_comp[k] *= src_alpha; + + /* calc destination alpha as alpha_d = (1.0 - alpha_s) * alpha_d / 1.0 */ + dst_alpha = dst_val & comp_mask_alpha; + dst_alpha *= src_alpha_inv; + dst_alpha /= G_MAXUINT16; + for (k = 0; k < G_N_ELEMENTS (dst_comp); k++) + dst_comp[k] *= dst_alpha; + + /* calc final pixel as pix_d = pix_s*alpha_s + pix_d*alpha_d*(255-alpha_s)/255 */ + for (k = 0; k < G_N_ELEMENTS (dst_comp); k++) + dst_comp[k] += src_comp[k]; + + /* calc the final destination alpha_d = alpha_s + alpha_d * (255-alpha_s)/255 */ + dst_alpha += src_alpha; + dst_alpha = CLAMP (dst_alpha, 0, G_MAXUINT16); + + /* now normalize the pix_d by the final alpha to make it associative */ + for (k = 0; k < G_N_ELEMENTS (dst_comp); k++) { + if (dst_alpha > 0) + dst_comp[k] /= dst_alpha; + dst_comp[k] = CLAMP (dst_comp[k], 0, G_MAXUINT16); + } + + dst_val = (dst_comp[0] << 48) | (dst_comp[1] << 32) | (dst_comp[2] << 16) + | dst_alpha; + dst[i] = dst_val; + } + } +} + +static void +compositor_overlay_argb64_addition (guint8 * ORC_RESTRICT d1, int d1_stride, + const guint8 * ORC_RESTRICT s1, int s1_stride, int p1, int n, int m) +{ + gint i, j, k; + const guint64 comp_mask_0 = 0xffff000000000000; + const guint64 comp_mask_1 = 0x0000ffff00000000; + const guint64 comp_mask_2 = 0x00000000ffff0000; + const guint64 comp_mask_alpha = 0x000000000000ffff; + + for (j = 0; j < m; j++) { + guint64 *dst; + guint64 *src; + + dst = (guint64 *) (d1 + (d1_stride * j)); + src = (guint64 *) (s1 + (s1_stride * j)); + + for (i = 0; i < n; i++) { + guint64 src_val = src[i]; + guint64 dst_val = dst[i]; + guint64 src_comp[3]; + guint64 dst_comp[3]; + guint64 src_alpha; + guint64 src_alpha_inv; + guint64 alpha_factor; + guint64 dst_alpha; + + src_comp[0] = (src_val & comp_mask_0) >> 48; + src_comp[1] = (src_val & comp_mask_1) >> 32; + src_comp[2] = (src_val & comp_mask_2) >> 16; + + dst_comp[0] = (dst_val & comp_mask_0) >> 48; + dst_comp[1] = (dst_val & comp_mask_1) >> 32; + dst_comp[2] = (dst_val & comp_mask_2) >> 16; + + /* calc source alpha as alpha_s = alpha_s * alpha / 255 */ + src_alpha = src_val & comp_mask_alpha; + src_alpha *= p1; + 
src_alpha /= G_MAXUINT16; + src_alpha = CLAMP (src_alpha, 0, G_MAXUINT16); + src_alpha_inv = G_MAXUINT16 - src_alpha; + + for (k = 0; k < G_N_ELEMENTS (src_comp); k++) + src_comp[k] *= src_alpha; + + /* calc destination alpha as alpha_factor = (255-alpha_s) * alpha_factor / factor */ + alpha_factor = dst_val & comp_mask_alpha; + alpha_factor *= src_alpha_inv; + alpha_factor /= G_MAXUINT16; + for (k = 0; k < G_N_ELEMENTS (dst_comp); k++) + dst_comp[k] *= alpha_factor; + + /* calc final pixel as pix_d = pix_s*alpha_s + pix_d*alpha_factor*(255-alpha_s)/255 */ + for (k = 0; k < G_N_ELEMENTS (dst_comp); k++) + dst_comp[k] += src_comp[k]; + + /* calc the alpha factor alpha_factor = alpha_s + alpha_factor * (255-alpha_s)/255 */ + alpha_factor += src_alpha; + alpha_factor = CLAMP (alpha_factor, 0, G_MAXUINT16); + + /* now normalize the pix_d by the final alpha to make it associative */ + for (k = 0; k < G_N_ELEMENTS (dst_comp); k++) { + if (alpha_factor > 0) + dst_comp[k] /= alpha_factor; + dst_comp[k] = CLAMP (dst_comp[k], 0, G_MAXUINT16); + } + + /* calc the final global alpha_d = alpha_d + (alpha_s * (alpha / 255)) */ + dst_alpha = dst_val & comp_mask_alpha; + dst_alpha += src_alpha; + dst_alpha = CLAMP (dst_alpha, 0, G_MAXUINT16); + + dst_val = (dst_comp[0] << 48) | (dst_comp[1] << 32) | (dst_comp[2] << 16) + | dst_alpha; + dst[i] = dst_val; + } + } +} +#else /* if G_BYTE_ORDER == G_LITTLE_ENDIAN */ +static void +compositor_blend_bgra64 (guint8 * ORC_RESTRICT d1, int d1_stride, + const guint8 * ORC_RESTRICT s1, int s1_stride, int p1, int n, int m) +{ + gint i, j, k; + const guint64 comp_mask_0 = 0x000000000000ffff; + const guint64 comp_mask_1 = 0x00000000ffff0000; + const guint64 comp_mask_2 = 0x0000ffff00000000; + const guint64 comp_mask_alpha = 0xffff000000000000; + + for (j = 0; j < m; j++) { + guint64 *dst; + guint64 *src; + + dst = (guint64 *) (d1 + (d1_stride * j)); + src = (guint64 *) (s1 + (s1_stride * j)); + + for (i = 0; i < n; i++) { + guint64 src_val = src[i]; + guint64 dst_val = dst[i]; + guint64 src_comp[3]; + guint64 dst_comp[3]; + guint64 src_alpha; + guint64 src_alpha_inv; + + src_comp[0] = src_val & comp_mask_0; + src_comp[1] = (src_val & comp_mask_1) >> 16; + src_comp[2] = (src_val & comp_mask_2) >> 32; + + dst_comp[0] = dst_val & comp_mask_0; + dst_comp[1] = (dst_val & comp_mask_1) >> 16; + dst_comp[2] = (dst_val & comp_mask_2) >> 32; + + src_alpha = (src_val & comp_mask_alpha) >> 48; + src_alpha *= p1; + src_alpha /= G_MAXUINT16; + src_alpha = CLAMP (src_alpha, 0, G_MAXUINT16); + src_alpha_inv = G_MAXUINT16 - src_alpha; + + for (k = 0; k < G_N_ELEMENTS (src_comp); k++) { + src_comp[k] *= src_alpha; + dst_comp[k] *= src_alpha_inv; + dst_comp[k] += src_comp[k]; + dst_comp[k] /= G_MAXUINT16; + + dst_comp[k] = CLAMP (dst_comp[k], 0, G_MAXUINT16); + } + + dst_val = (dst_comp[0]) | (dst_comp[1] << 16) | (dst_comp[2] << 32) + | comp_mask_alpha; + dst[i] = dst_val; + } + } +} + +static void +compositor_source_bgra64 (guint8 * ORC_RESTRICT d1, int d1_stride, + const guint8 * ORC_RESTRICT s1, int s1_stride, int p1, int n, int m) +{ + gint i, j; + const guint64 comp_mask_non_alpha = 0x0000ffffffffffff; + const guint64 comp_mask_alpha = 0xffff000000000000; + + for (j = 0; j < m; j++) { + guint64 *dst; + guint64 *src; + + dst = (guint64 *) (d1 + (d1_stride * j)); + src = (guint64 *) (s1 + (s1_stride * j)); + + for (i = 0; i < n; i++) { + guint64 src_val = src[i]; + guint64 dst_val; + guint64 src_alpha; + + src_alpha = (src_val & comp_mask_alpha) >> 48; + src_alpha *= p1; + src_alpha /= 
G_MAXUINT16; + src_alpha = CLAMP (src_alpha, 0, G_MAXUINT16); + src_alpha <<= 48; + + dst_val = (src_val & comp_mask_non_alpha) | src_alpha; + dst[i] = dst_val; + } + } +} + +static void +compositor_overlay_bgra64 (guint8 * ORC_RESTRICT d1, int d1_stride, + const guint8 * ORC_RESTRICT s1, int s1_stride, int p1, int n, int m) +{ + gint i, j, k; + const guint64 comp_mask_0 = 0x000000000000ffff; + const guint64 comp_mask_1 = 0x00000000ffff0000; + const guint64 comp_mask_2 = 0x0000ffff00000000; + const guint64 comp_mask_alpha = 0xffff000000000000; + + for (j = 0; j < m; j++) { + guint64 *dst; + guint64 *src; + + dst = (guint64 *) (d1 + (d1_stride * j)); + src = (guint64 *) (s1 + (s1_stride * j)); + + for (i = 0; i < n; i++) { + guint64 src_val = src[i]; + guint64 dst_val = dst[i]; + guint64 src_comp[3]; + guint64 dst_comp[3]; + guint64 src_alpha; + guint64 src_alpha_inv; + guint64 dst_alpha; + + src_comp[0] = src_val & comp_mask_0; + src_comp[1] = (src_val & comp_mask_1) >> 16; + src_comp[2] = (src_val & comp_mask_2) >> 32; + + dst_comp[0] = dst_val & comp_mask_0; + dst_comp[1] = (dst_val & comp_mask_1) >> 16; + dst_comp[2] = (dst_val & comp_mask_2) >> 32; + + /* calc source alpha as alpha_s = alpha_s * alpha / 255 */ + src_alpha = (src_val & comp_mask_alpha) >> 48; + src_alpha *= p1; + src_alpha /= G_MAXUINT16; + src_alpha = CLAMP (src_alpha, 0, G_MAXUINT16); + src_alpha_inv = G_MAXUINT16 - src_alpha; + + for (k = 0; k < G_N_ELEMENTS (src_comp); k++) + src_comp[k] *= src_alpha; + /* calc destination alpha as alpha_d = (1.0 - alpha_s) * alpha_d / 1.0 */ + dst_alpha = (dst_val & comp_mask_alpha) >> 48; + dst_alpha *= src_alpha_inv; + dst_alpha /= G_MAXUINT16; + for (k = 0; k < G_N_ELEMENTS (dst_comp); k++) + dst_comp[k] *= dst_alpha; + + /* calc final pixel as pix_d = pix_s*alpha_s + pix_d*alpha_d*(255-alpha_s)/255 */ + for (k = 0; k < G_N_ELEMENTS (dst_comp); k++) + dst_comp[k] += src_comp[k]; + + /* calc the final destination alpha_d = alpha_s + alpha_d * (255-alpha_s)/255 */ + dst_alpha += src_alpha; + dst_alpha = CLAMP (dst_alpha, 0, G_MAXUINT16); + + /* now normalize the pix_d by the final alpha to make it associative */ + for (k = 0; k < G_N_ELEMENTS (dst_comp); k++) { + if (dst_alpha > 0) + dst_comp[k] /= dst_alpha; + dst_comp[k] = CLAMP (dst_comp[k], 0, G_MAXUINT16); + } + + dst_val = (dst_comp[0]) | (dst_comp[1] << 16) | (dst_comp[2] << 32) + | (dst_alpha << 48); + dst[i] = dst_val; + } + } +} + +static void +compositor_overlay_bgra64_addition (guint8 * ORC_RESTRICT d1, int d1_stride, + const guint8 * ORC_RESTRICT s1, int s1_stride, int p1, int n, int m) +{ + gint i, j, k; + const guint64 comp_mask_0 = 0x000000000000ffff; + const guint64 comp_mask_1 = 0x00000000ffff0000; + const guint64 comp_mask_2 = 0x0000ffff00000000; + const guint64 comp_mask_alpha = 0xffff000000000000; + + for (j = 0; j < m; j++) { + guint64 *dst; + guint64 *src; + + dst = (guint64 *) (d1 + (d1_stride * j)); + src = (guint64 *) (s1 + (s1_stride * j)); + + for (i = 0; i < n; i++) { + guint64 src_val = src[i]; + guint64 dst_val = dst[i]; + guint64 src_comp[3]; + guint64 dst_comp[3]; + guint64 src_alpha; + guint64 src_alpha_inv; + guint64 alpha_factor; + guint64 dst_alpha; + + src_comp[0] = src_val & comp_mask_0; + src_comp[1] = (src_val & comp_mask_1) >> 16; + src_comp[2] = (src_val & comp_mask_2) >> 32; + + dst_comp[0] = dst_val & comp_mask_0; + dst_comp[1] = (dst_val & comp_mask_1) >> 16; + dst_comp[2] = (dst_val & comp_mask_2) >> 32; + + /* calc source alpha as alpha_s = alpha_s * alpha / 255 */ + src_alpha = 
(src_val & comp_mask_alpha) >> 48; + src_alpha *= p1; + src_alpha /= G_MAXUINT16; + src_alpha = CLAMP (src_alpha, 0, G_MAXUINT16); + src_alpha_inv = G_MAXUINT16 - src_alpha; + + for (k = 0; k < G_N_ELEMENTS (src_comp); k++) + src_comp[k] *= src_alpha; + + /* calc destination alpha as alpha_factor = (255-alpha_s) * alpha_factor / factor */ + alpha_factor = (dst_val & comp_mask_alpha) >> 48; + alpha_factor *= src_alpha_inv; + alpha_factor /= G_MAXUINT16; + for (k = 0; k < G_N_ELEMENTS (dst_comp); k++) + dst_comp[k] *= alpha_factor; + + /* calc final pixel as pix_d = pix_s*alpha_s + pix_d*alpha_factor*(255-alpha_s)/255 */ + for (k = 0; k < G_N_ELEMENTS (dst_comp); k++) + dst_comp[k] += src_comp[k]; + + /* calc the alpha factor alpha_factor = alpha_s + alpha_factor * (255-alpha_s)/255 */ + alpha_factor += src_alpha; + alpha_factor = CLAMP (alpha_factor, 0, G_MAXUINT16); + + /* now normalize the pix_d by the final alpha to make it associative */ + for (k = 0; k < G_N_ELEMENTS (dst_comp); k++) { + if (alpha_factor > 0) + dst_comp[k] /= alpha_factor; + dst_comp[k] = CLAMP (dst_comp[k], 0, G_MAXUINT16); + } + + /* calc the final global alpha_d = alpha_d + (alpha_s * (alpha / 255)) */ + dst_alpha = (dst_val & comp_mask_alpha) >> 48; + dst_alpha += src_alpha; + dst_alpha = CLAMP (dst_alpha, 0, G_MAXUINT16); + + dst_val = (dst_comp[0]) | (dst_comp[1] << 16) | (dst_comp[2] << 32) + | (dst_alpha << 48); + dst[i] = dst_val; + } + } +} +#endif /* if G_BYTE_ORDER == G_LITTLE_ENDIAN */ + +/* for AYUV64, ARGB64 */ +#define BLEND_A64(name, method, LOOP) \ +static void \ +method##_ ##name (GstVideoFrame * srcframe, gint xpos, gint ypos, \ + gdouble src_alpha, GstVideoFrame * destframe, gint dst_y_start, \ + gint dst_y_end, GstCompositorBlendMode mode) \ +{ \ + guint s_alpha; \ + gint src_stride, dest_stride; \ + gint dest_width, dest_height; \ + guint8 *src, *dest; \ + gint src_width, src_height; \ + \ + src_width = GST_VIDEO_FRAME_WIDTH (srcframe); \ + src_height = GST_VIDEO_FRAME_HEIGHT (srcframe); \ + src = GST_VIDEO_FRAME_PLANE_DATA (srcframe, 0); \ + src_stride = GST_VIDEO_FRAME_COMP_STRIDE (srcframe, 0); \ + dest = GST_VIDEO_FRAME_PLANE_DATA (destframe, 0); \ + dest_stride = GST_VIDEO_FRAME_COMP_STRIDE (destframe, 0); \ + dest_width = GST_VIDEO_FRAME_COMP_WIDTH (destframe, 0); \ + dest_height = GST_VIDEO_FRAME_COMP_HEIGHT (destframe, 0); \ + \ + s_alpha = CLAMP ((gint) (src_alpha * G_MAXUINT16), 0, G_MAXUINT16); \ + \ + /* If it's completely transparent... 
we just return */ \ + if (G_UNLIKELY (s_alpha == 0)) \ + return; \ + \ + if (dst_y_end > dest_height) { \ + dst_y_end = dest_height; \ + } \ + /* adjust src pointers for negative sizes */ \ + if (xpos < 0) { \ + src += -xpos * 8; \ + src_width -= -xpos; \ + xpos = 0; \ + } \ + if (ypos < dst_y_start) { \ + src += (dst_y_start - ypos) * src_stride; \ + src_height -= dst_y_start - ypos; \ + ypos = dst_y_start; \ + } \ + /* adjust width/height if the src is bigger than dest */ \ + if (xpos + src_width > dest_width) { \ + src_width = dest_width - xpos; \ + } \ + if (ypos + src_height > dst_y_end) { \ + src_height = dst_y_end - ypos; \ + } \ + \ + if (src_height > 0 && src_width > 0) { \ + dest = dest + 8 * xpos + (ypos * dest_stride); \ + \ + LOOP (dest, src, src_height, src_width, src_stride, dest_stride, s_alpha, \ + mode); \ + } \ +} + +#define OVERLAY_A64_LOOP(name) \ +static inline void \ +_overlay_loop_##name (guint8 * dest, const guint8 * src, gint src_height, \ + gint src_width, gint src_stride, gint dest_stride, guint s_alpha, \ + GstCompositorBlendMode mode) \ +{ \ + s_alpha = MIN (G_MAXUINT16, s_alpha); \ + switch (mode) { \ + case COMPOSITOR_BLEND_MODE_SOURCE:\ + if (s_alpha == G_MAXUINT16) { \ + guint y; \ + for (y = 0; y < src_height; y++) { \ + memcpy (dest, src, 8 * src_width); \ + dest += dest_stride; \ + src += src_stride; \ + } \ + } else { \ + compositor_source_##name (dest, dest_stride, src, src_stride, \ + s_alpha, src_width, src_height); \ + } \ + break;\ + case COMPOSITOR_BLEND_MODE_OVER:\ + compositor_overlay_##name (dest, dest_stride, src, src_stride, \ + s_alpha, src_width, src_height); \ + break;\ + case COMPOSITOR_BLEND_MODE_ADD:\ + compositor_overlay_##name##_addition (dest, dest_stride, src, src_stride, \ + s_alpha, src_width, src_height); \ + break;\ + }\ +} + +#define BLEND_A64_LOOP(name) \ +static inline void \ +_blend_loop_##name (guint8 * dest, const guint8 * src, gint src_height, \ + gint src_width, gint src_stride, gint dest_stride, guint s_alpha, \ + GstCompositorBlendMode mode) \ +{ \ + s_alpha = MIN (G_MAXUINT16, s_alpha); \ + switch (mode) { \ + case COMPOSITOR_BLEND_MODE_SOURCE:\ + if (s_alpha == G_MAXUINT16) { \ + guint y; \ + for (y = 0; y < src_height; y++) { \ + memcpy (dest, src, 8 * src_width); \ + dest += dest_stride; \ + src += src_stride; \ + } \ + } else { \ + compositor_source_##name (dest, dest_stride, src, src_stride, \ + s_alpha, src_width, src_height); \ + } \ + break;\ + case COMPOSITOR_BLEND_MODE_OVER:\ + case COMPOSITOR_BLEND_MODE_ADD:\ + /* both modes are the same for opaque background */ \ + compositor_blend_##name (dest, dest_stride, src, src_stride, \ + s_alpha, src_width, src_height); \ + break;\ + }\ +} + +#if G_BYTE_ORDER == G_LITTLE_ENDIAN +OVERLAY_A64_LOOP (argb64); +BLEND_A64_LOOP (argb64); +BLEND_A64 (argb64, blend, _blend_loop_argb64); +BLEND_A64 (argb64, overlay, _overlay_loop_argb64); +#else +OVERLAY_A64_LOOP (bgra64); +BLEND_A64_LOOP (bgra64); +BLEND_A64 (argb64, blend, _blend_loop_bgra64); +BLEND_A64 (argb64, overlay, _overlay_loop_bgra64); +#endif + +#define A64_CHECKER_C(name, RGB, A, C1, C2, C3) \ +static void \ +fill_checker_##name##_c (GstVideoFrame * frame, guint y_start, guint y_end) \ +{ \ + gint i, j; \ + gint val; \ + static const gint tab[] = { 20480, 40960, 20480, 40960 }; \ + static const gint uv = 1 << 15; \ + gint width, stride; \ + guint8 *dest; \ + \ + dest = GST_VIDEO_FRAME_PLANE_DATA (frame, 0); \ + width = GST_VIDEO_FRAME_COMP_WIDTH (frame, 0); \ + stride = GST_VIDEO_FRAME_COMP_STRIDE (frame, 0); \ 
+ \ + if (!RGB) { \ + for (i = y_start; i < y_end; i++) { \ + guint16 *data = (guint16 *) (dest + i * stride); \ + for (j = 0; j < width; j++) { \ + data[A] = 0xffff; \ + data[C1] = tab[((i & 0x8) >> 3) + ((j & 0x8) >> 3)]; \ + data[C2] = uv; \ + data[C3] = uv; \ + data += 4; \ + } \ + } \ + } else { \ + for (i = y_start; i < y_end; i++) { \ + guint16 *data = (guint16 *) (dest + i * stride); \ + for (j = 0; j < width; j++) { \ + val = tab[((i & 0x8) >> 3) + ((j & 0x8) >> 3)]; \ + data[A] = 0xffff; \ + data[C1] = val; \ + data[C2] = val; \ + data[C3] = val; \ + data += 4; \ + } \ + } \ + } \ +} + +A64_CHECKER_C (argb64, TRUE, 0, 1, 2, 3); +A64_CHECKER_C (ayuv64, FALSE, 0, 1, 2, 3); + +#define A64_COLOR(name, A, C1, C2, C3) \ +static void \ +fill_color_##name (GstVideoFrame * frame, guint y_start, guint y_end, gint c1, gint c2, gint c3) \ +{ \ + gint i, j; \ + gint stride; \ + guint8 *dest; \ + guint width; \ + guint height; \ + \ + height = y_end - y_start; \ + if (height <= 0) \ + return; \ + \ + dest = GST_VIDEO_FRAME_PLANE_DATA (frame, 0); \ + stride = GST_VIDEO_FRAME_COMP_STRIDE (frame, 0); \ + width = GST_VIDEO_FRAME_WIDTH (frame); \ + \ + for (i = y_start; i < y_end; i++) { \ + guint16 *data = (guint16 *) (dest + i * stride); \ + for (j = 0; j < width; j++) { \ + data[A] = 0xffff; \ + data[C1] = c1; \ + data[C2] = c2; \ + data[C3] = c3; \ + data += 4; \ + } \ + } \ +} + +A64_COLOR (argb64, 0, 1, 2, 3); + /* NV12, NV21 */ #define NV_YUV_BLEND(format_name,MEMCPY,BLENDLOOP) \ inline static void \ @@ -1270,6 +1957,9 @@ BlendFunction gst_compositor_blend_y444_12le; BlendFunction gst_compositor_blend_y444_12be; BlendFunction gst_compositor_blend_y444_16le; BlendFunction gst_compositor_blend_y444_16be; +BlendFunction gst_compositor_blend_argb64; +BlendFunction gst_compositor_overlay_argb64; +/* AYUV64 is equal to ARGB64 */ FillCheckerFunction gst_compositor_fill_checker_argb; FillCheckerFunction gst_compositor_fill_checker_bgra; @@ -1297,6 +1987,8 @@ FillCheckerFunction gst_compositor_fill_checker_i420_12le; FillCheckerFunction gst_compositor_fill_checker_i420_12be; FillCheckerFunction gst_compositor_fill_checker_y444_16le; FillCheckerFunction gst_compositor_fill_checker_y444_16be; +FillCheckerFunction gst_compositor_fill_checker_argb64; +FillCheckerFunction gst_compositor_fill_checker_ayuv64; FillColorFunction gst_compositor_fill_color_argb; FillColorFunction gst_compositor_fill_color_bgra; @@ -1326,6 +2018,7 @@ FillColorFunction gst_compositor_fill_color_i420_12le; FillColorFunction gst_compositor_fill_color_i420_12be; FillColorFunction gst_compositor_fill_color_y444_16le; FillColorFunction gst_compositor_fill_color_y444_16be; +FillColorFunction gst_compositor_fill_color_argb64; void gst_compositor_init_blend (void) @@ -1360,6 +2053,8 @@ gst_compositor_init_blend (void) gst_compositor_blend_y444_12be = GST_DEBUG_FUNCPTR (blend_y444_12be); gst_compositor_blend_y444_16le = GST_DEBUG_FUNCPTR (blend_y444_16le); gst_compositor_blend_y444_16be = GST_DEBUG_FUNCPTR (blend_y444_16be); + gst_compositor_blend_argb64 = GST_DEBUG_FUNCPTR (blend_argb64); + gst_compositor_overlay_argb64 = GST_DEBUG_FUNCPTR (overlay_argb64); gst_compositor_fill_checker_argb = GST_DEBUG_FUNCPTR (fill_checker_argb_c); gst_compositor_fill_checker_bgra = GST_DEBUG_FUNCPTR (fill_checker_bgra_c); @@ -1388,6 +2083,10 @@ gst_compositor_init_blend (void) GST_DEBUG_FUNCPTR (fill_checker_y444_16le); gst_compositor_fill_checker_y444_16be = GST_DEBUG_FUNCPTR (fill_checker_y444_16be); + gst_compositor_fill_checker_argb64 = + 
GST_DEBUG_FUNCPTR (fill_checker_argb64_c);
+  gst_compositor_fill_checker_ayuv64 =
+      GST_DEBUG_FUNCPTR (fill_checker_ayuv64_c);
 
   gst_compositor_fill_color_argb = GST_DEBUG_FUNCPTR (fill_color_argb);
   gst_compositor_fill_color_bgra = GST_DEBUG_FUNCPTR (fill_color_bgra);
@@ -1422,4 +2121,5 @@ gst_compositor_init_blend (void)
       GST_DEBUG_FUNCPTR (fill_color_y444_16le);
   gst_compositor_fill_color_y444_16be =
       GST_DEBUG_FUNCPTR (fill_color_y444_16be);
+  gst_compositor_fill_color_argb64 = GST_DEBUG_FUNCPTR (fill_color_argb64);
 }
diff --git a/subprojects/gst-plugins-base/gst/compositor/blend.h b/subprojects/gst-plugins-base/gst/compositor/blend.h
index 6761acb..8f64243 100644
--- a/subprojects/gst-plugins-base/gst/compositor/blend.h
+++ b/subprojects/gst-plugins-base/gst/compositor/blend.h
@@ -65,6 +65,9 @@ extern BlendFunction gst_compositor_overlay_bgra;
 #define gst_compositor_overlay_vuya gst_compositor_overlay_bgra
 #define gst_compositor_overlay_abgr gst_compositor_overlay_argb
 #define gst_compositor_overlay_rgba gst_compositor_overlay_bgra
+extern BlendFunction gst_compositor_overlay_argb64;
+#define gst_compositor_overlay_ayuv64 gst_compositor_overlay_argb64
+
 extern BlendFunction gst_compositor_blend_i420;
 #define gst_compositor_blend_yv12 gst_compositor_blend_i420
 extern BlendFunction gst_compositor_blend_nv12;
@@ -95,6 +98,9 @@ extern BlendFunction gst_compositor_blend_y444_12le;
 extern BlendFunction gst_compositor_blend_y444_12be;
 extern BlendFunction gst_compositor_blend_y444_16le;
 extern BlendFunction gst_compositor_blend_y444_16be;
+extern BlendFunction gst_compositor_blend_argb64;
+#define gst_compositor_blend_ayuv64 gst_compositor_blend_argb64
+
 extern FillCheckerFunction gst_compositor_fill_checker_argb;
 #define gst_compositor_fill_checker_abgr gst_compositor_fill_checker_argb
@@ -132,6 +138,8 @@ extern FillCheckerFunction gst_compositor_fill_checker_i420_12be;
 #define gst_compositor_fill_checker_y444_12be gst_compositor_fill_checker_i420_12be
 extern FillCheckerFunction gst_compositor_fill_checker_y444_16le;
 extern FillCheckerFunction gst_compositor_fill_checker_y444_16be;
+extern FillCheckerFunction gst_compositor_fill_checker_argb64;
+extern FillCheckerFunction gst_compositor_fill_checker_ayuv64;
 
 extern FillColorFunction gst_compositor_fill_color_argb;
 extern FillColorFunction gst_compositor_fill_color_abgr;
@@ -169,6 +177,8 @@ extern FillColorFunction gst_compositor_fill_color_i420_12be;
 #define gst_compositor_fill_color_y444_12be gst_compositor_fill_color_i420_12be
 extern FillColorFunction gst_compositor_fill_color_y444_16le;
 extern FillColorFunction gst_compositor_fill_color_y444_16be;
+extern FillColorFunction gst_compositor_fill_color_argb64;
+#define gst_compositor_fill_color_ayuv64 gst_compositor_fill_color_argb64
 
 void gst_compositor_init_blend (void);
 
diff --git a/subprojects/gst-plugins-base/gst/compositor/compositor.c b/subprojects/gst-plugins-base/gst/compositor/compositor.c
index e94239d..8763780 100644
--- a/subprojects/gst-plugins-base/gst/compositor/compositor.c
+++ b/subprojects/gst-plugins-base/gst/compositor/compositor.c
@@ -103,28 +103,10 @@ GST_DEBUG_CATEGORY_STATIC (gst_compositor_debug);
 #define GST_CAT_DEFAULT gst_compositor_debug
 
-#if G_BYTE_ORDER == G_LITTLE_ENDIAN
-#define FORMATS " { AYUV, VUYA, BGRA, ARGB, RGBA, ABGR, " \
-    " Y444_16LE, Y444_16BE, Y444_12LE, Y444_12BE, Y444_10LE, Y444_10BE, " \
-    " Y444, Y42B, YUY2, UYVY, YVYU, "\
-    " I422_12LE, I422_12BE, I422_10LE, I422_10BE, "\
-    " I420_12LE, I420_12BE, I420_10LE, I420_10BE, " \
-    " I420, YV12, NV12, 
NV21, Y41B, RGB, BGR, xRGB, xBGR, "\ - " RGBx, BGRx } " -#else -#define FORMATS " { AYUV, VUYA, BGRA, ARGB, RGBA, ABGR, "\ - " Y444_16BE, Y444_16LE, Y444_12BE, Y444_12LE, Y444_10BE, Y444_10LE, " \ - " Y444, Y42B, YUY2, UYVY, YVYU, "\ - " I422_12BE, I422_12LE, I422_10BE, I422_10LE, "\ - " I420_12BE, I420_12LE, I420_10BE, I420_10LE, "\ - " I420, YV12, NV12, NV21, Y41B, RGB, BGR, xRGB, xBGR, "\ - " RGBx, BGRx } " -#endif - static GstStaticPadTemplate src_factory = GST_STATIC_PAD_TEMPLATE ("src", GST_PAD_SRC, GST_PAD_ALWAYS, - GST_STATIC_CAPS (GST_VIDEO_CAPS_MAKE (FORMATS)) + GST_STATIC_CAPS (GST_VIDEO_CAPS_MAKE (GST_VIDEO_FORMATS_ALL)) ); static GstStaticPadTemplate sink_factory = GST_STATIC_PAD_TEMPLATE ("sink_%u", @@ -622,6 +604,7 @@ static void gst_compositor_pad_create_conversion_info (GstVideoAggregatorConvertPad * pad, GstVideoAggregator * vagg, GstVideoInfo * conversion_info) { + GstCompositor *self = GST_COMPOSITOR (vagg); GstCompositorPad *cpad = GST_COMPOSITOR_PAD (pad); gint width, height; gint x_offset, y_offset; @@ -632,7 +615,24 @@ gst_compositor_pad_create_conversion_info (GstVideoAggregatorConvertPad * pad, if (!conversion_info->finfo) return; - _mixer_pad_get_output_size (GST_COMPOSITOR (vagg), cpad, + /* Need intermediate conversion? */ + if (self->intermediate_frame) { + GstVideoInfo intermediate_info; + gst_video_info_set_interlaced_format (&intermediate_info, + GST_VIDEO_INFO_FORMAT (&self->intermediate_info), + conversion_info->interlace_mode, + GST_VIDEO_INFO_WIDTH (conversion_info), + GST_VIDEO_INFO_HEIGHT (conversion_info)); + intermediate_info.colorimetry = conversion_info->colorimetry; + intermediate_info.par_n = conversion_info->par_n; + intermediate_info.par_d = conversion_info->par_d; + intermediate_info.fps_n = conversion_info->fps_n; + intermediate_info.fps_d = conversion_info->fps_d; + intermediate_info.flags = conversion_info->flags; + *conversion_info = intermediate_info; + } + + _mixer_pad_get_output_size (self, cpad, GST_VIDEO_INFO_PAR_N (&vagg->info), GST_VIDEO_INFO_PAR_D (&vagg->info), &width, &height, &x_offset, &y_offset); @@ -647,8 +647,9 @@ gst_compositor_pad_create_conversion_info (GstVideoAggregatorConvertPad * pad, * colorimetry, and chroma-site and our current pixel-aspect-ratio * and other relevant fields. 
*/ - gst_video_info_set_format (&tmp_info, - GST_VIDEO_INFO_FORMAT (conversion_info), width, height); + gst_video_info_set_interlaced_format (&tmp_info, + GST_VIDEO_INFO_FORMAT (conversion_info), + conversion_info->interlace_mode, width, height); tmp_info.chroma_site = conversion_info->chroma_site; tmp_info.colorimetry = conversion_info->colorimetry; tmp_info.par_n = conversion_info->par_n; @@ -656,7 +657,6 @@ gst_compositor_pad_create_conversion_info (GstVideoAggregatorConvertPad * pad, tmp_info.fps_n = conversion_info->fps_n; tmp_info.fps_d = conversion_info->fps_d; tmp_info.flags = conversion_info->flags; - tmp_info.interlace_mode = conversion_info->interlace_mode; *conversion_info = tmp_info; } @@ -818,11 +818,16 @@ set_functions (GstCompositor * self, const GstVideoInfo * info) gint scale[GST_VIDEO_MAX_COMPONENTS] = { 0, }; gint i; + gst_clear_buffer (&self->intermediate_frame); + g_clear_pointer (&self->intermediate_convert, gst_video_converter_free); + self->blend = NULL; self->overlay = NULL; self->fill_checker = NULL; self->fill_color = NULL; + self->intermediate_info = *info; + switch (GST_VIDEO_INFO_FORMAT (info)) { case GST_VIDEO_FORMAT_AYUV: self->blend = gst_compositor_blend_ayuv; @@ -1040,15 +1045,93 @@ set_functions (GstCompositor * self, const GstVideoInfo * info) self->fill_checker = gst_compositor_fill_checker_bgrx; self->fill_color = gst_compositor_fill_color_bgrx; break; + case GST_VIDEO_FORMAT_ARGB64: + self->blend = gst_compositor_blend_argb64; + self->overlay = gst_compositor_overlay_argb64; + self->fill_checker = gst_compositor_fill_checker_argb64; + self->fill_color = gst_compositor_fill_color_argb64; + break; + case GST_VIDEO_FORMAT_AYUV64: + self->blend = gst_compositor_blend_ayuv64; + self->overlay = gst_compositor_overlay_ayuv64; + self->fill_checker = gst_compositor_fill_checker_ayuv64; + self->fill_color = gst_compositor_fill_color_ayuv64; + break; default: - GST_ERROR_OBJECT (self, "Unhandled format %s", + { + GstVideoFormat format = GST_VIDEO_FORMAT_UNKNOWN; + GstVideoInfo *intermediate_info = &self->intermediate_info; + if (GST_VIDEO_INFO_IS_YUV (info)) { + if (GST_VIDEO_INFO_COMP_DEPTH (info, 0) == 8) + format = GST_VIDEO_FORMAT_AYUV; + else + format = GST_VIDEO_FORMAT_AYUV64; + } else { + if (GST_VIDEO_INFO_COMP_DEPTH (info, 0) == 8) + format = GST_VIDEO_FORMAT_ARGB; + else + format = GST_VIDEO_FORMAT_ARGB64; + } + + switch (format) { + case GST_VIDEO_FORMAT_AYUV: + self->blend = gst_compositor_blend_ayuv; + self->overlay = gst_compositor_overlay_ayuv; + self->fill_checker = gst_compositor_fill_checker_ayuv; + self->fill_color = gst_compositor_fill_color_ayuv; + break; + case GST_VIDEO_FORMAT_AYUV64: + self->blend = gst_compositor_blend_ayuv64; + self->overlay = gst_compositor_overlay_ayuv64; + self->fill_checker = gst_compositor_fill_checker_ayuv64; + self->fill_color = gst_compositor_fill_color_ayuv64; + break; + case GST_VIDEO_FORMAT_ARGB: + self->blend = gst_compositor_blend_argb; + self->overlay = gst_compositor_overlay_argb; + self->fill_checker = gst_compositor_fill_checker_argb; + self->fill_color = gst_compositor_fill_color_argb; + break; + case GST_VIDEO_FORMAT_ARGB64: + self->blend = gst_compositor_blend_argb64; + self->overlay = gst_compositor_overlay_argb64; + self->fill_checker = gst_compositor_fill_checker_argb64; + self->fill_color = gst_compositor_fill_color_argb64; + break; + default: + GST_ERROR_OBJECT (self, "Unhandled format %s -> %s", + gst_video_format_to_string (GST_VIDEO_INFO_FORMAT (info)), + gst_video_format_to_string 
(format));
+          return FALSE;
+      }
+
+      GST_DEBUG_OBJECT (self,
+          "Configured intermediate format %s for output format %s",
+          gst_video_format_to_string (format),
           gst_video_format_to_string (GST_VIDEO_INFO_FORMAT (info)));
-      return FALSE;
+
+      /* needs intermediate conversion */
+      gst_video_info_set_interlaced_format (intermediate_info,
+          format, info->interlace_mode, info->width, info->height);
+      intermediate_info->par_n = info->par_n;
+      intermediate_info->par_d = info->par_d;
+      intermediate_info->fps_n = info->fps_n;
+      intermediate_info->fps_d = info->fps_d;
+      intermediate_info->flags = info->flags;
+
+      /* preserve colorimetry if required */
+      if (!GST_VIDEO_INFO_IS_GRAY (info))
+        intermediate_info->colorimetry = info->colorimetry;
+
+      self->intermediate_frame =
+          gst_buffer_new_and_alloc (self->intermediate_info.size);
+      break;
+    }
   }
 
   /* calculate black and white colors */
-  gst_video_color_range_offsets (info->colorimetry.range, info->finfo,
-      offset, scale);
+  gst_video_color_range_offsets (self->intermediate_info.colorimetry.range,
+      self->intermediate_info.finfo, offset, scale);
   if (GST_VIDEO_INFO_IS_YUV (info)) {
     /* black color [0.0, 0.0, 0.0] */
     self->black_color[0] = offset[0];
@@ -1341,10 +1424,37 @@ _negotiated_caps (GstAggregator * agg, GstCaps * caps)
     gst_clear_object (&pool);
   }
 
+  if (compositor->intermediate_frame) {
+    GstStructure *config = NULL;
+    GstTaskPool *pool = gst_video_aggregator_get_execution_task_pool (vagg);
+
+    if (pool && n_threads > 1) {
+      config = gst_structure_new_empty ("GstVideoConverterConfig");
+      gst_structure_set (config, GST_VIDEO_CONVERTER_OPT_THREADS,
+          G_TYPE_UINT, n_threads, NULL);
+    }
+
+    compositor->intermediate_convert =
+        gst_video_converter_new_with_pool (&compositor->intermediate_info,
+        &v_info, config, pool);
+    gst_clear_object (&pool);
+  }
+
   return GST_AGGREGATOR_CLASS (parent_class)->negotiated_src_caps (agg, caps);
 }
 
 static gboolean
+gst_compositor_stop (GstAggregator * agg)
+{
+  GstCompositor *self = GST_COMPOSITOR (agg);
+
+  gst_clear_buffer (&self->intermediate_frame);
+  g_clear_pointer (&self->intermediate_convert, gst_video_converter_free);
+
+  return GST_AGGREGATOR_CLASS (parent_class)->stop (agg);
+}
+
+static gboolean
 _should_draw_background (GstVideoAggregator * vagg)
 {
   GstVideoRectangle bg_rect;
@@ -1490,7 +1600,7 @@ gst_compositor_aggregate_frames (GstVideoAggregator * vagg, GstBuffer * outbuf)
 {
   GstCompositor *compositor = GST_COMPOSITOR (vagg);
   GList *l;
-  GstVideoFrame out_frame, *outframe;
+  GstVideoFrame out_frame, intermediate_frame, *outframe;
   gboolean draw_background;
   guint drawn_a_pad = FALSE;
   struct CompositePadInfo *pads_info;
@@ -1503,6 +1613,18 @@ gst_compositor_aggregate_frames (GstVideoAggregator * vagg, GstBuffer * outbuf)
 
   outframe = &out_frame;
 
+  if (compositor->intermediate_frame) {
+    if (!gst_video_frame_map (&intermediate_frame,
+            &compositor->intermediate_info, compositor->intermediate_frame,
+            GST_MAP_READWRITE)) {
+      GST_WARNING_OBJECT (vagg, "Could not map intermediate buffer");
+      gst_video_frame_unmap (&out_frame);
+      return GST_FLOW_ERROR;
+    }
+
+    outframe = &intermediate_frame;
+  }
+
   /* If one of the frames to be composited completely obscures the background,
    * don't bother drawing the background at all. We can also always use the
    * 'blend' BlendFunction in that case because it only changes if we have to
@@ -1603,7 +1725,14 @@
 
   GST_OBJECT_UNLOCK (vagg);
 
-  gst_video_frame_unmap (outframe);
+  if (compositor->intermediate_frame) {
+    gst_video_converter_frame (compositor->intermediate_convert,
+        &intermediate_frame, &out_frame);
+
+    gst_video_frame_unmap (&intermediate_frame);
+  }
+
+  gst_video_frame_unmap (&out_frame);
 
   return GST_FLOW_OK;
 }
@@ -1797,6 +1926,7 @@ gst_compositor_class_init (GstCompositorClass * klass)
   agg_class->src_event = _src_event;
   agg_class->fixate_src_caps = _fixate_caps;
   agg_class->negotiated_src_caps = _negotiated_caps;
+  agg_class->stop = GST_DEBUG_FUNCPTR (gst_compositor_stop);
   videoaggregator_class->aggregate_frames = gst_compositor_aggregate_frames;
 
   g_object_class_install_property (gobject_class, PROP_BACKGROUND,
diff --git a/subprojects/gst-plugins-base/gst/compositor/compositor.h b/subprojects/gst-plugins-base/gst/compositor/compositor.h
index 57f46f9..c3d2998 100644
--- a/subprojects/gst-plugins-base/gst/compositor/compositor.h
+++ b/subprojects/gst-plugins-base/gst/compositor/compositor.h
@@ -149,6 +149,10 @@ struct _GstCompositor
   gint white_color[GST_VIDEO_MAX_COMPONENTS];
   gint black_color[GST_VIDEO_MAX_COMPONENTS];
 
+  GstBuffer *intermediate_frame;
+  GstVideoInfo intermediate_info;
+  GstVideoConverter *intermediate_convert;
+
   GstParallelizedTaskRunner *blend_runner;
 };
 
-- 
2.7.4
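
Reviewer note (not part of the patch): the fallback added to set_functions()
reduces to the selection rule below. pick_intermediate_format() is a
hypothetical helper name used only for illustration; the patch inlines this
logic in the default case of the output-format switch.

#include <gst/video/video.h>

/* Choose the unpack format used for slow-path blending: YUV inputs map to
 * AYUV/AYUV64 and everything else to ARGB/ARGB64; 8-bit data stays in an
 * 8-bit intermediate, deeper formats use the 16-bit variants. */
static GstVideoFormat
pick_intermediate_format (const GstVideoInfo * info)
{
  if (GST_VIDEO_INFO_IS_YUV (info)) {
    if (GST_VIDEO_INFO_COMP_DEPTH (info, 0) == 8)
      return GST_VIDEO_FORMAT_AYUV;
    return GST_VIDEO_FORMAT_AYUV64;
  }

  if (GST_VIDEO_INFO_COMP_DEPTH (info, 0) == 8)
    return GST_VIDEO_FORMAT_ARGB;
  return GST_VIDEO_FORMAT_ARGB64;
}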
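
Reviewer note (not part of the patch): per component,
compositor_overlay_argb64() and its big-endian twin implement premultiplied
"over" composition followed by an unpremultiply step. Because samples are
16-bit here, unit alpha is G_MAXUINT16 (65535), not the 255 mentioned in the
comments carried over from the 8-bit code. A minimal single-component sketch
of the arithmetic, with hypothetical names:

#include <glib.h>

/* Blend one 16-bit component of src over dst; alpha values are in
 * [0, G_MAXUINT16]. Returns the blended component and writes the
 * resulting alpha to out_alpha. */
static guint16
over_component_16 (guint16 src_c, guint64 src_a, guint16 dst_c, guint64 dst_a,
    guint64 * out_alpha)
{
  /* destination contribution scaled by what the source leaves over */
  guint64 a_d = (dst_a * (G_MAXUINT16 - src_a)) / G_MAXUINT16;
  guint64 a_o = src_a + a_d;    /* final alpha, never exceeds G_MAXUINT16 */
  guint64 c = (guint64) src_c * src_a + (guint64) dst_c * a_d;

  if (a_o > 0)
    c /= a_o;                   /* unpremultiply by the final alpha */

  *out_alpha = a_o;
  return (guint16) CLAMP (c, 0, G_MAXUINT16);
}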
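
Reviewer note (not part of the patch): a quick way to exercise the new slow
path is to negotiate a format that has no dedicated blend function, e.g. NV16,
which was not in the old FORMATS list and so goes through the AYUV
intermediate. A minimal smoke-test sketch, assuming the patched plugin is
installed:

#include <gst/gst.h>

int
main (int argc, char **argv)
{
  GstElement *pipeline;
  GstBus *bus;
  GstMessage *msg;

  gst_init (&argc, &argv);

  /* NV16 in and out forces compositor through the intermediate conversion */
  pipeline = gst_parse_launch ("videotestsrc num-buffers=30 ! "
      "video/x-raw,format=NV16 ! compositor ! video/x-raw,format=NV16 ! "
      "fakesink", NULL);

  gst_element_set_state (pipeline, GST_STATE_PLAYING);

  /* wait for EOS (success) or an error from the pipeline */
  bus = gst_element_get_bus (pipeline);
  msg = gst_bus_timed_pop_filtered (bus, GST_CLOCK_TIME_NONE,
      GST_MESSAGE_EOS | GST_MESSAGE_ERROR);
  if (msg)
    gst_message_unref (msg);

  gst_object_unref (bus);
  gst_element_set_state (pipeline, GST_STATE_NULL);
  gst_object_unref (pipeline);

  return 0;
}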