sys/nvcodec/cuda-converter.c

   1 /* GStreamer
   2  * Copyright (C) 2010 David Schleef <ds@schleef.org>
   3  * Copyright (C) 2010 Sebastian Dröge <sebastian.droege@collabora.co.uk>
   4  * Copyright (C) 2019 Seungha Yang <seungha.yang@navercorp.com>
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Library General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Library General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Library General Public
  17  * License along with this library; if not, write to the
  18  * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
  19  * Boston, MA 02110-1301, USA.
  20  */
  21
  22 /**
  23  * SECTION:cudaconverter
  24  * @title: GstCudaConverter
  25  * @short_description: Generic video conversion using CUDA
  26  *
  27  * This object is used to convert video frames from one format to another.
  28  * The object can perform conversion of:
  29  *
  30  *  * video format
  31  *  * video colorspace
  32  *  * video size
  33  */
  34
  35 /**
  36  * TODO:
  37  *  * Add more interpolation method and make it selectable,
  38  *    currently default bi-linear interpolation only
  39  *  * Add fast-path for conversion like videoconvert
  40  *  * Full colorimetry and chroma-siting support
  41  *  * cropping, and x, y position support
  42  */
  43
  44 #ifdef HAVE_CONFIG_H
  45 #include "config.h"
  46 #endif
  47
  48 #include "cuda-converter.h"
  49 #include "gstcudautils.h"
  50 #include "gstcudaloader.h"
  51 #include "gstcudanvrtc.h"
  52 #include <string.h>
  53
  54 #define CUDA_BLOCK_X 16
  55 #define CUDA_BLOCK_Y 16
  56 #define DIV_UP(size,block) (((size) + ((block) - 1)) / (block))
  57
  58 static gboolean cuda_converter_lookup_path (GstCudaConverter * convert);
  59
  60 #ifndef GST_DISABLE_GST_DEBUG
  61 #define GST_CAT_DEFAULT ensure_debug_category()
  62 static GstDebugCategory *
  63 ensure_debug_category (void)
  64 {
  65   static gsize cat_gonce = 0;
  66
  67   if (g_once_init_enter (&cat_gonce)) {
  68     gsize cat_done;
  69
  70     cat_done = (gsize) _gst_debug_category_new ("cuda-converter", 0,
  71         "cuda-converter object");
  72
  73     g_once_init_leave (&cat_gonce, cat_done);
  74   }
  75
  76   return (GstDebugCategory *) cat_gonce;
  77 }
  78 #else
  79 #define ensure_debug_category()
  80 #endif
  81
  82 #define GST_CUDA_KERNEL_FUNC "gst_cuda_kernel_func"
  83
  84 #define GST_CUDA_KERNEL_FUNC_TO_Y444 "gst_cuda_kernel_func_to_y444"
  85
  86 #define GST_CUDA_KERNEL_FUNC_Y444_TO_YUV "gst_cuda_kernel_func_y444_to_yuv"
  87
  88 #define GST_CUDA_KERNEL_FUNC_TO_ARGB "gst_cuda_kernel_func_to_argb"
  89
  90 #define GST_CUDA_KERNEL_FUNC_SCALE_RGB "gst_cuda_kernel_func_scale_rgb"
  91
  92 /* *INDENT-OFF* */
  93 /**
  94  * read_chroma:
  95  * @tex1: a CUDA texture object representing a semi-planar chroma plane
  96  * @tex2: dummy object
  97  * @x: the x coordinate to read data from @tex1
  98  * @y: the y coordinate to read data from @tex1
  99  *
 100  * Returns: a #ushort2 vector representing both chroma pixel values
 101  */
 102 static const gchar READ_CHROMA_FROM_SEMI_PLANAR[] =
 103 "__device__ ushort2\n"
 104 "read_chroma (cudaTextureObject_t tex1, cudaTextureObject_t tex2, \n"
 105 "    float x, float y)\n"
 106 "{\n"
 107 "  return tex2D<ushort2>(tex1, x, y);\n"
 108 "}";
 109
 110 /**
 111  * read_chroma:
 112  * @tex1: a CUDA texture object representing a chroma planar plane
 113  * @tex2: a CUDA texture object representing the other planar plane
 114  * @x: the x coordinate to read data from @tex1 and @tex2
 115  * @y: the y coordinate to read data from @tex1 and @tex2
 116  *
 117  * Returns: a #ushort2 vector representing both chroma pixel values
 118  */
 119 static const gchar READ_CHROMA_FROM_PLANAR[] =
 120 "__device__ ushort2\n"
 121 "read_chroma (cudaTextureObject_t tex1, cudaTextureObject_t tex2, \n"
 122 "    float x, float y)\n"
 123 "{\n"
 124 "  unsigned short u, v;\n"
 125 "  u = tex2D<unsigned short>(tex1, x, y);\n"
 126 "  v = tex2D<unsigned short>(tex2, x, y);\n"
 127 "  return make_ushort2(u, v);\n"
 128 "}";
 129
 130 /**
 131  * write_chroma:
 132  * @dst1: a CUDA global memory pointing to a semi-planar chroma plane
 133  * @dst2: dummy
 134  * @u: a pixel value to write @dst1
 135  * @v: a pixel value to write @dst1
 136  * @x: the x coordinate to write data into @tex1
 137  * @x: the y coordinate to write data into @tex1
 138  * @pstride: the pixel stride of @dst1
 139  * @mask: bitmask to be applied to high bitdepth plane
 140  *
 141  * Write @u and @v pixel value to @dst1 semi-planar plane
 142  */
 143 static const gchar WRITE_CHROMA_TO_SEMI_PLANAR[] =
 144 "__device__ void\n"
 145 "write_chroma (unsigned char *dst1, unsigned char *dst2, unsigned short u,\n"
 146 "    unsigned short v, int x, int y, int pstride, int stride, int mask)\n"
 147 "{\n"
 148 "  if (OUT_DEPTH > 8) {\n"
 149 "    *(unsigned short *)&dst1[x * pstride + y * stride] = (u & mask);\n"
 150 "    *(unsigned short *)&dst1[x * pstride + 2 + y * stride] = (v & mask);\n"
 151 "  } else {\n"
 152 "    dst1[x * pstride + y * stride] = u;\n"
 153 "    dst1[x * pstride + 1 + y * stride] = v;\n"
 154 "  }\n"
 155 "}";
 156
 157 /**
 158  * write_chroma:
 159  * @dst1: a CUDA global memory pointing to a planar chroma plane
 160  * @dst2: a CUDA global memory pointing to a the other planar chroma plane
 161  * @u: a pixel value to write @dst1
 162  * @v: a pixel value to write @dst1
 163  * @x: the x coordinate to write data into @tex1
 164  * @x: the y coordinate to write data into @tex1
 165  * @pstride: the pixel stride of @dst1
 166  * @mask: bitmask to be applied to high bitdepth plane
 167  *
 168  * Write @u and @v pixel value into @dst1 and @dst2 planar planes
 169  */
 170 static const gchar WRITE_CHROMA_TO_PLANAR[] =
 171 "__device__ void\n"
 172 "write_chroma (unsigned char *dst1, unsigned char *dst2, unsigned short u,\n"
 173 "    unsigned short v, int x, int y, int pstride, int stride, int mask)\n"
 174 "{\n"
 175 "  if (OUT_DEPTH > 8) {\n"
 176 "    *(unsigned short *)&dst1[x * pstride + y * stride] = (u & mask);\n"
 177 "    *(unsigned short *)&dst2[x * pstride + y * stride] = (v & mask);\n"
 178 "  } else {\n"
 179 "    dst1[x * pstride + y * stride] = u;\n"
 180 "    dst2[x * pstride + y * stride] = v;\n"
 181 "  }\n"
 182 "}";
 183
 184 /* CUDA kernel source for from YUV to YUV conversion and scale */
 185 static const gchar templ_YUV_TO_YUV[] =
 186 "extern \"C\"{\n"
 187 "__constant__ float SCALE_H = %f;\n"
 188 "__constant__ float SCALE_V = %f;\n"
 189 "__constant__ float CHROMA_SCALE_H = %f;\n"
 190 "__constant__ float CHROMA_SCALE_V = %f;\n"
 191 "__constant__ int WIDTH = %d;\n"
 192 "__constant__ int HEIGHT = %d;\n"
 193 "__constant__ int CHROMA_WIDTH = %d;\n"
 194 "__constant__ int CHROMA_HEIGHT = %d;\n"
 195 "__constant__ int IN_DEPTH = %d;\n"
 196 "__constant__ int OUT_DEPTH = %d;\n"
 197 "__constant__ int PSTRIDE = %d;\n"
 198 "__constant__ int CHROMA_PSTRIDE = %d;\n"
 199 "__constant__ int IN_SHIFT = %d;\n"
 200 "__constant__ int OUT_SHIFT = %d;\n"
 201 "__constant__ int MASK = %d;\n"
 202 "__constant__ int SWAP_UV = %d;\n"
 203 "\n"
 204 "__device__ unsigned short\n"
 205 "do_scale_pixel (unsigned short val) \n"
 206 "{\n"
 207 "  unsigned int diff;\n"
 208 "  if (OUT_DEPTH > IN_DEPTH) {\n"
 209 "    diff = OUT_DEPTH - IN_DEPTH;\n"
 210 "    return (val << diff) | (val >> (IN_DEPTH - diff));\n"
 211 "  } else if (IN_DEPTH > OUT_DEPTH) {\n"
 212 "    return val >> (IN_DEPTH - OUT_DEPTH);\n"
 213 "  }\n"
 214 "  return val;\n"
 215 "}\n"
 216 "\n"
 217 /* __device__ ushort2
 218  * read_chroma (cudaTextureObject_t tex1, cudaTextureObject_t tex2, float x, float y);
 219  */
 220 "%s\n"
 221 "\n"
 222 /* __device__ void
 223  * write_chroma (unsigned char *dst1, unsigned char *dst2, unsigned short u,
 224  *     unsigned short v, int x, int y, int pstride, int stride, int mask);
 225  */
 226 "%s\n"
 227 "\n"
 228 "__global__ void\n"
 229 GST_CUDA_KERNEL_FUNC
 230 "(cudaTextureObject_t tex0, cudaTextureObject_t tex1, cudaTextureObject_t tex2,\n"
 231 "    unsigned char *dst0, unsigned char *dst1, unsigned char *dst2,\n"
 232 "    int stride)\n"
 233 "{\n"
 234 "  int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
 235 "  int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
 236 "  if (x_pos < WIDTH && y_pos < HEIGHT) {\n"
 237 "    float src_xpos = SCALE_H * x_pos;\n"
 238 "    float src_ypos = SCALE_V * y_pos;\n"
 239 "    unsigned short y = tex2D<unsigned short>(tex0, src_xpos, src_ypos);\n"
 240 "    y = y >> IN_SHIFT;\n"
 241 "    y = do_scale_pixel (y);\n"
 242 "    y = y << OUT_SHIFT;\n"
 243 "    if (OUT_DEPTH > 8) {\n"
 244 "      *(unsigned short *)&dst0[x_pos * PSTRIDE + y_pos * stride] = (y & MASK);\n"
 245 "    } else {\n"
 246 "      dst0[x_pos * PSTRIDE + y_pos * stride] = y;\n"
 247 "    }\n"
 248 "  }\n"
 249 "  if (x_pos < CHROMA_WIDTH && y_pos < CHROMA_HEIGHT) {\n"
 250 "    float src_xpos = CHROMA_SCALE_H * x_pos;\n"
 251 "    float src_ypos = CHROMA_SCALE_V * y_pos;\n"
 252 "    unsigned short u, v;\n"
 253 "    ushort2 uv = read_chroma (tex1, tex2, src_xpos, src_ypos);\n"
 254 "    u = uv.x;\n"
 255 "    v = uv.y;\n"
 256 "    u = u >> IN_SHIFT;\n"
 257 "    v = v >> IN_SHIFT;\n"
 258 "    u = do_scale_pixel (u);\n"
 259 "    v = do_scale_pixel (v);\n"
 260 "    u = u << OUT_SHIFT;\n"
 261 "    v = v << OUT_SHIFT;\n"
 262 "    if (SWAP_UV) {\n"
 263 "      unsigned short tmp = u;\n"
 264 "      u = v;\n"
 265 "      v = tmp;\n"
 266 "    }\n"
 267 "    write_chroma (dst1,\n"
 268 "      dst2, u, v, x_pos, y_pos, CHROMA_PSTRIDE, stride, MASK);\n"
 269 "  }\n"
 270 "}\n"
 271 "\n"
 272 "}";
 273
 274 /* CUDA kernel source for from YUV to RGB conversion and scale */
 275 static const gchar templ_YUV_TO_RGB[] =
 276 "extern \"C\"{\n"
 277 "__constant__ float offset[3] = {%f, %f, %f};\n"
 278 "__constant__ float rcoeff[3] = {%f, %f, %f};\n"
 279 "__constant__ float gcoeff[3] = {%f, %f, %f};\n"
 280 "__constant__ float bcoeff[3] = {%f, %f, %f};\n"
 281 "\n"
 282 "__constant__ float SCALE_H = %f;\n"
 283 "__constant__ float SCALE_V = %f;\n"
 284 "__constant__ float CHROMA_SCALE_H = %f;\n"
 285 "__constant__ float CHROMA_SCALE_V = %f;\n"
 286 "__constant__ int WIDTH = %d;\n"
 287 "__constant__ int HEIGHT = %d;\n"
 288 "__constant__ int CHROMA_WIDTH = %d;\n"
 289 "__constant__ int CHROMA_HEIGHT = %d;\n"
 290 "__constant__ int IN_DEPTH = %d;\n"
 291 "__constant__ int OUT_DEPTH = %d;\n"
 292 "__constant__ int PSTRIDE = %d;\n"
 293 "__constant__ int CHROMA_PSTRIDE = %d;\n"
 294 "__constant__ int IN_SHIFT = %d;\n"
 295 "__constant__ int OUT_SHIFT = %d;\n"
 296 "__constant__ int MASK = %d;\n"
 297 "__constant__ int SWAP_UV = %d;\n"
 298 "__constant__ int MAX_IN_VAL = %d;\n"
 299 "__constant__ int R_IDX = %d;\n"
 300 "__constant__ int G_IDX = %d;\n"
 301 "__constant__ int B_IDX = %d;\n"
 302 "__constant__ int A_IDX = %d;\n"
 303 "__constant__ int X_IDX = %d;\n"
 304 "\n"
 305 "__device__ unsigned short\n"
 306 "do_scale_pixel (unsigned short val) \n"
 307 "{\n"
 308 "  unsigned int diff;\n"
 309 "  if (OUT_DEPTH > IN_DEPTH) {\n"
 310 "    diff = OUT_DEPTH - IN_DEPTH;\n"
 311 "    return (val << diff) | (val >> (IN_DEPTH - diff));\n"
 312 "  } else if (IN_DEPTH > OUT_DEPTH) {\n"
 313 "    return val >> (IN_DEPTH - OUT_DEPTH);\n"
 314 "  }\n"
 315 "  return val;\n"
 316 "}\n"
 317 "\n"
 318 "__device__ float\n"
 319 "dot(float3 val, float *coeff)\n"
 320 "{\n"
 321 "  return val.x * coeff[0] + val.y * coeff[1] + val.z * coeff[2];\n"
 322 "}\n"
 323 "\n"
 324 "__device__ uint3\n"
 325 "yuv_to_rgb (unsigned short y, unsigned short u, unsigned short v, unsigned int max_val)\n"
 326 "{\n"
 327 "  float3 yuv = make_float3 (y, u, v);\n"
 328 "  uint3 rgb;\n"
 329 "  rgb.x = max ((unsigned int)(dot (yuv, rcoeff) + offset[0]), 0);\n"
 330 "  rgb.y = max ((unsigned int)(dot (yuv, gcoeff) + offset[1]), 0);\n"
 331 "  rgb.z = max ((unsigned int)(dot (yuv, bcoeff) + offset[2]), 0);\n"
 332 "  rgb.x = min (rgb.x, max_val);\n"
 333 "  rgb.y = min (rgb.y, max_val);\n"
 334 "  rgb.z = min (rgb.z, max_val);\n"
 335 "  return rgb;\n"
 336 "}\n"
 337 "\n"
 338 /* __device__ ushort2
 339  * read_chroma (cudaTextureObject_t tex1, cudaTextureObject_t tex2, float x, float y);
 340  */
 341 "%s\n"
 342 "\n"
 343 "__global__ void\n"
 344 GST_CUDA_KERNEL_FUNC
 345 "(cudaTextureObject_t tex0, cudaTextureObject_t tex1, cudaTextureObject_t tex2,\n"
 346 "    unsigned char *dstRGB, int stride)\n"
 347 "{\n"
 348 "  int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
 349 "  int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
 350 "  if (x_pos < WIDTH && y_pos < HEIGHT) {\n"
 351 "    float src_xpos = SCALE_H * x_pos;\n"
 352 "    float src_ypos = SCALE_V * y_pos;\n"
 353 "    unsigned short y = tex2D<unsigned short>(tex0, src_xpos, src_ypos);\n"
 354 "    ushort2 uv;\n"
 355 "    unsigned short u, v;\n"
 356 "    uint3 rgb;\n"
 357 "    unsigned int clip_max = MAX_IN_VAL;\n"
 358 "    src_xpos = CHROMA_SCALE_H * x_pos;\n"
 359 "    src_ypos = CHROMA_SCALE_V * y_pos;\n"
 360 "    uv = read_chroma (tex1, tex2, src_xpos, src_ypos);\n"
 361 "    u = uv.x;\n"
 362 "    v = uv.y;\n"
 363 "    y = y >> IN_SHIFT;\n"
 364 "    u = u >> IN_SHIFT;\n"
 365 "    v = v >> IN_SHIFT;\n"
 366 "    if (SWAP_UV) {\n"
 367 "      unsigned short tmp = u;\n"
 368 "      u = v;\n"
 369 "      v = tmp;\n"
 370 "    }\n"
 371      /* conversion matrix is scaled to higher bitdepth between in/out formats */
 372 "    if (OUT_DEPTH > IN_DEPTH) {\n"
 373 "      y = do_scale_pixel (y);\n"
 374 "      u = do_scale_pixel (u);\n"
 375 "      v = do_scale_pixel (v);\n"
 376 "      clip_max = MASK;\n"
 377 "    }"
 378 "    rgb = yuv_to_rgb (y, u, v, clip_max);\n"
 379 "    if (OUT_DEPTH < IN_DEPTH) {\n"
 380 "      rgb.x = do_scale_pixel (rgb.x);\n"
 381 "      rgb.y = do_scale_pixel (rgb.y);\n"
 382 "      rgb.z = do_scale_pixel (rgb.z);\n"
 383 "    }"
 384 "    if (OUT_DEPTH > 8) {\n"
 385 "      unsigned int packed_rgb = 0;\n"
 386        /* A is always MSB, we support only little endian system */
 387 "      packed_rgb = 0xc000 << 16;\n"
 388 "      packed_rgb |= (rgb.x << (30 - (R_IDX * 10)));\n"
 389 "      packed_rgb |= (rgb.y << (30 - (G_IDX * 10)));\n"
 390 "      packed_rgb |= (rgb.z << (30 - (B_IDX * 10)));\n"
 391 "      *(unsigned int *)&dstRGB[x_pos * PSTRIDE + y_pos * stride] = packed_rgb;\n"
 392 "    } else {\n"
 393 "      dstRGB[x_pos * PSTRIDE + R_IDX + y_pos * stride] = (unsigned char) rgb.x;\n"
 394 "      dstRGB[x_pos * PSTRIDE + G_IDX + y_pos * stride] = (unsigned char) rgb.y;\n"
 395 "      dstRGB[x_pos * PSTRIDE + B_IDX + y_pos * stride] = (unsigned char) rgb.z;\n"
 396 "      if (A_IDX >= 0 || X_IDX >= 0)\n"
 397 "        dstRGB[x_pos * PSTRIDE + A_IDX + y_pos * stride] = 0xff;\n"
 398 "    }\n"
 399 "  }\n"
 400 "}\n"
 401 "\n"
 402 "}";
 403
 404 /**
 405  * GST_CUDA_KERNEL_FUNC_TO_ARGB:
 406  * @srcRGB: a CUDA global memory containing a RGB image
 407  * @dstRGB: a CUDA global memory to store unpacked ARGB image
 408  * @width: the width of @srcRGB and @dstRGB
 409  * @height: the height of @srcRGB and @dstRGB
 410  * @src_stride: the stride of @srcRGB
 411  * @src_pstride: the pixel stride of @srcRGB
 412  * @dst_stride: the stride of @dstRGB
 413  * @r_idx: the index of red component of @srcRGB
 414  * @g_idx: the index of green component of @srcRGB
 415  * @b_idx: the index of blue component of @srcRGB
 416  * @a_idx: the index of alpha component of @srcRGB
 417  *
 418  * Unpack a RGB image from @srcRGB and write the unpacked data into @dstRGB
 419  */
 420 static const gchar unpack_to_ARGB[] =
 421 "__global__ void\n"
 422 GST_CUDA_KERNEL_FUNC_TO_ARGB
 423 "(unsigned char *srcRGB, unsigned char *dstRGB, int width, int height,\n"
 424 "    int src_stride, int src_pstride, int dst_stride,\n"
 425 "    int r_idx, int g_idx, int b_idx, int a_idx)\n"
 426 "{\n"
 427 "  int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
 428 "  int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
 429 "  if (x_pos < width && y_pos < height) {\n"
 430 "    if (a_idx >= 0) {\n"
 431 "      dstRGB[x_pos * 4 + y_pos * dst_stride] =\n"
 432 "          srcRGB[x_pos * src_pstride + a_idx + y_pos * src_stride];\n"
 433 "    } else {\n"
 434 "      dstRGB[x_pos * 4 + y_pos * dst_stride] = 0xff;\n"
 435 "    }\n"
 436 "    dstRGB[x_pos * 4 + 1 + y_pos * dst_stride] =\n"
 437 "        srcRGB[x_pos * src_pstride + r_idx + y_pos * src_stride];\n"
 438 "    dstRGB[x_pos * 4 + 2 + y_pos * dst_stride] =\n"
 439 "        srcRGB[x_pos * src_pstride + g_idx + y_pos * src_stride];\n"
 440 "    dstRGB[x_pos * 4 + 3 + y_pos * dst_stride] =\n"
 441 "        srcRGB[x_pos * src_pstride + b_idx + y_pos * src_stride];\n"
 442 "  }\n"
 443 "}\n";
 444
 445 /**
 446  * GST_CUDA_KERNEL_FUNC_TO_ARGB:
 447  * @srcRGB: a CUDA global memory containing a RGB image
 448  * @dstRGB: a CUDA global memory to store unpacked ARGB64 image
 449  * @width: the width of @srcRGB and @dstRGB
 450  * @height: the height of @srcRGB and @dstRGB
 451  * @src_stride: the stride of @srcRGB
 452  * @src_pstride: the pixel stride of @srcRGB
 453  * @dst_stride: the stride of @dstRGB
 454  * @r_idx: the index of red component of @srcRGB
 455  * @g_idx: the index of green component of @srcRGB
 456  * @b_idx: the index of blue component of @srcRGB
 457  * @a_idx: the index of alpha component of @srcRGB
 458  *
 459  * Unpack a RGB image from @srcRGB and write the unpacked data into @dstRGB
 460  */
 461 static const gchar unpack_to_ARGB64[] =
 462 "__global__ void\n"
 463 GST_CUDA_KERNEL_FUNC_TO_ARGB
 464 "(unsigned char *srcRGB, unsigned char *dstRGB, int width, int height,\n"
 465 "    int src_stride, int src_pstride, int dst_stride,\n"
 466 "    int r_idx, int g_idx, int b_idx, int a_idx)\n"
 467 "{\n"
 468 "  int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
 469 "  int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
 470 "  if (x_pos < width && y_pos < height) {\n"
 471 "    unsigned short a, r, g, b;\n"
 472 "    unsigned int read_val;\n"
 473 "    read_val = *(unsigned int *)&srcRGB[x_pos * src_pstride + y_pos * src_stride];\n"
 474 "    a = (read_val >> 30) & 0x03;\n"
 475 "    a = (a << 14) | (a << 12) | (a << 10) | (a << 8) | (a << 6) | (a << 4) | (a << 2) | (a << 0);\n"
 476 "    r = ((read_val >> (30 - (r_idx * 10))) & 0x3ff);\n"
 477 "    r = (r << 6) | (r >> 4);\n"
 478 "    g = ((read_val >> (30 - (g_idx * 10))) & 0x3ff);\n"
 479 "    g = (g << 6) | (g >> 4);\n"
 480 "    b = ((read_val >> (30 - (b_idx * 10))) & 0x3ff);\n"
 481 "    b = (b << 6) | (b >> 4);\n"
 482 "    *(unsigned short *)&dstRGB[x_pos * 8 + y_pos * dst_stride] = 0xffff;\n"
 483 "    *(unsigned short *)&dstRGB[x_pos * 8 + 2 + y_pos * dst_stride] = r;\n"
 484 "    *(unsigned short *)&dstRGB[x_pos * 8 + 4 + y_pos * dst_stride] = g;\n"
 485 "    *(unsigned short *)&dstRGB[x_pos * 8 + 6 + y_pos * dst_stride] = b;\n"
 486 "  }\n"
 487 "}\n";
 488
 489 /* CUDA kernel source for from RGB to YUV conversion and scale */
 490 static const gchar templ_RGB_TO_YUV[] =
 491 "extern \"C\"{\n"
 492 "__constant__ float offset[3] = {%f, %f, %f};\n"
 493 "__constant__ float ycoeff[3] = {%f, %f, %f};\n"
 494 "__constant__ float ucoeff[3] = {%f, %f, %f};\n"
 495 "__constant__ float vcoeff[3] = {%f, %f, %f};\n"
 496 "\n"
 497 "__constant__ float SCALE_H = %f;\n"
 498 "__constant__ float SCALE_V = %f;\n"
 499 "__constant__ float CHROMA_SCALE_H = %f;\n"
 500 "__constant__ float CHROMA_SCALE_V = %f;\n"
 501 "__constant__ int WIDTH = %d;\n"
 502 "__constant__ int HEIGHT = %d;\n"
 503 "__constant__ int CHROMA_WIDTH = %d;\n"
 504 "__constant__ int CHROMA_HEIGHT = %d;\n"
 505 "__constant__ int IN_DEPTH = %d;\n"
 506 "__constant__ int OUT_DEPTH = %d;\n"
 507 "__constant__ int PSTRIDE = %d;\n"
 508 "__constant__ int CHROMA_PSTRIDE = %d;\n"
 509 "__constant__ int IN_SHIFT = %d;\n"
 510 "__constant__ int OUT_SHIFT = %d;\n"
 511 "__constant__ int MASK = %d;\n"
 512 "__constant__ int SWAP_UV = %d;\n"
 513 "\n"
 514 "__device__ unsigned short\n"
 515 "do_scale_pixel (unsigned short val) \n"
 516 "{\n"
 517 "  unsigned int diff;\n"
 518 "  if (OUT_DEPTH > IN_DEPTH) {\n"
 519 "    diff = OUT_DEPTH - IN_DEPTH;\n"
 520 "    return (val << diff) | (val >> (IN_DEPTH - diff));\n"
 521 "  } else if (IN_DEPTH > OUT_DEPTH) {\n"
 522 "    return val >> (IN_DEPTH - OUT_DEPTH);\n"
 523 "  }\n"
 524 "  return val;\n"
 525 "}\n"
 526 "\n"
 527 "__device__ float\n"
 528 "dot(float3 val, float *coeff)\n"
 529 "{\n"
 530 "  return val.x * coeff[0] + val.y * coeff[1] + val.z * coeff[2];\n"
 531 "}\n"
 532 "\n"
 533 "__device__ uint3\n"
 534 "rgb_to_yuv (unsigned short r, unsigned short g, unsigned short b,\n"
 535 "    unsigned int max_val)\n"
 536 "{\n"
 537 "  float3 rgb = make_float3 (r, g, b);\n"
 538 "  uint3 yuv;\n"
 539 "  yuv.x = max ((unsigned int)(dot (rgb, ycoeff) + offset[0]), 0);\n"
 540 "  yuv.y = max ((unsigned int)(dot (rgb, ucoeff) + offset[1]), 0);\n"
 541 "  yuv.z = max ((unsigned int)(dot (rgb, vcoeff) + offset[2]), 0);\n"
 542 "  yuv.x = min (yuv.x, max_val);\n"
 543 "  yuv.y = min (yuv.y, max_val);\n"
 544 "  yuv.z = min (yuv.z, max_val);\n"
 545 "  return yuv;\n"
 546 "}\n"
 547 "\n"
 548 /* __global__ void
 549  * GST_CUDA_KERNEL_FUNC_TO_ARGB
 550  */
 551 "%s\n"
 552 "\n"
 553 /* __device__ ushort2
 554  * read_chroma (cudaTextureObject_t tex1, cudaTextureObject_t tex2, float x, float y);
 555  */
 556 "%s\n"
 557 "\n"
 558 /* __device__ void
 559  * write_chroma (unsigned char *dst1, unsigned char *dst2, unsigned short u,
 560  *     unsigned short v, int x, int y, int pstride, int stride, int mask);
 561  */
 562 "%s\n"
 563 "\n"
 564 "__global__ void\n"
 565 GST_CUDA_KERNEL_FUNC_TO_Y444
 566 "(cudaTextureObject_t srcRGB, unsigned char *dstY, int y_stride,\n"
 567 "    unsigned char *dstU, int u_stride, unsigned char *dstV, int v_stride,\n"
 568 "    int width, int height, int dst_pstride, int in_depth)\n"
 569 "{\n"
 570 "  int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
 571 "  int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
 572 "  if (x_pos < width && y_pos < height) {\n"
 573 "    ushort4 argb = tex2D<ushort4>(srcRGB, x_pos, y_pos);\n"
 574 "    uint3 yuv;\n"
 575 "    yuv = rgb_to_yuv (argb.y, argb.z, argb.w, (1 << in_depth) - 1);\n"
 576 "    if (in_depth > 8) {\n"
 577 "      *(unsigned short *)&dstY[x_pos * dst_pstride + y_pos * y_stride] = yuv.x;\n"
 578 "      *(unsigned short *)&dstU[x_pos * dst_pstride + y_pos * u_stride] = yuv.y;\n"
 579 "      *(unsigned short *)&dstV[x_pos * dst_pstride + y_pos * v_stride] = yuv.z;\n"
 580 "    } else {\n"
 581 "      dstY[x_pos * dst_pstride + y_pos * y_stride] = yuv.x;\n"
 582 "      dstU[x_pos * dst_pstride + y_pos * u_stride] = yuv.y;\n"
 583 "      dstV[x_pos * dst_pstride + y_pos * v_stride] = yuv.z;\n"
 584 "    }\n"
 585 "  }\n"
 586 "}\n"
 587 "\n"
 588 "__global__ void\n"
 589 GST_CUDA_KERNEL_FUNC_Y444_TO_YUV
 590 "(cudaTextureObject_t tex0, cudaTextureObject_t tex1, cudaTextureObject_t tex2,\n"
 591 "    unsigned char *dst0, unsigned char *dst1, unsigned char *dst2,\n"
 592 "    int stride)\n"
 593 "{\n"
 594 "  int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
 595 "  int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
 596 "  if (x_pos < WIDTH && y_pos < HEIGHT) {\n"
 597 "    float src_xpos = SCALE_H * x_pos;\n"
 598 "    float src_ypos = SCALE_V * y_pos;\n"
 599 "    unsigned short y = tex2D<unsigned short>(tex0, src_xpos, src_ypos);\n"
 600 "    y = y >> IN_SHIFT;\n"
 601 "    y = do_scale_pixel (y);\n"
 602 "    y = y << OUT_SHIFT;\n"
 603 "    if (OUT_DEPTH > 8) {\n"
 604 "      *(unsigned short *)&dst0[x_pos * PSTRIDE + y_pos * stride] = (y & MASK);\n"
 605 "    } else {\n"
 606 "      dst0[x_pos * PSTRIDE + y_pos * stride] = y;\n"
 607 "    }\n"
 608 "  }\n"
 609 "  if (x_pos < CHROMA_WIDTH && y_pos < CHROMA_HEIGHT) {\n"
 610 "    float src_xpos = CHROMA_SCALE_H * x_pos;\n"
 611 "    float src_ypos = CHROMA_SCALE_V * y_pos;\n"
 612 "    unsigned short u, v;\n"
 613 "    ushort2 uv;\n"
 614 "    uv = read_chroma (tex1, tex2, src_xpos, src_ypos);\n"
 615 "    u = uv.x;\n"
 616 "    v = uv.y;\n"
 617 "    u = u >> IN_SHIFT;\n"
 618 "    v = v >> IN_SHIFT;\n"
 619 "    u = do_scale_pixel (u);\n"
 620 "    v = do_scale_pixel (v);\n"
 621 "    u = u << OUT_SHIFT;\n"
 622 "    v = v << OUT_SHIFT;\n"
 623 "    if (SWAP_UV) {\n"
 624 "      unsigned short tmp = u;\n"
 625 "      u = v;\n"
 626 "      v = tmp;\n"
 627 "    }\n"
 628 "    write_chroma (dst1,\n"
 629 "      dst2, u, v, x_pos, y_pos, CHROMA_PSTRIDE, stride, MASK);\n"
 630 "  }\n"
 631 "}\n"
 632 "\n"
 633 "}";
 634
 635 /* CUDA kernel source for from RGB to RGB conversion and scale */
 636 static const gchar templ_RGB_to_RGB[] =
 637 "extern \"C\"{\n"
 638 "__constant__ float SCALE_H = %f;\n"
 639 "__constant__ float SCALE_V = %f;\n"
 640 "__constant__ int WIDTH = %d;\n"
 641 "__constant__ int HEIGHT = %d;\n"
 642 "__constant__ int IN_DEPTH = %d;\n"
 643 "__constant__ int OUT_DEPTH = %d;\n"
 644 "__constant__ int PSTRIDE = %d;\n"
 645 "__constant__ int R_IDX = %d;\n"
 646 "__constant__ int G_IDX = %d;\n"
 647 "__constant__ int B_IDX = %d;\n"
 648 "__constant__ int A_IDX = %d;\n"
 649 "__constant__ int X_IDX = %d;\n"
 650 "\n"
 651 "__device__ unsigned short\n"
 652 "do_scale_pixel (unsigned short val) \n"
 653 "{\n"
 654 "  unsigned int diff;\n"
 655 "  if (OUT_DEPTH > IN_DEPTH) {\n"
 656 "    diff = OUT_DEPTH - IN_DEPTH;\n"
 657 "    return (val << diff) | (val >> (IN_DEPTH - diff));\n"
 658 "  } else if (IN_DEPTH > OUT_DEPTH) {\n"
 659 "    return val >> (IN_DEPTH - OUT_DEPTH);\n"
 660 "  }\n"
 661 "  return val;\n"
 662 "}\n"
 663 "\n"
 664 /* __global__ void
 665  * GST_CUDA_KERNEL_FUNC_TO_ARGB
 666  */
 667 "%s\n"
 668 "\n"
 669 /* convert ARGB or ARGB64 to other RGB formats with scale */
 670 "__global__ void\n"
 671 GST_CUDA_KERNEL_FUNC_SCALE_RGB
 672 "(cudaTextureObject_t srcRGB, unsigned char *dstRGB, int dst_stride)\n"
 673 "{\n"
 674 "  int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
 675 "  int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
 676 "  if (x_pos < WIDTH && y_pos < HEIGHT) {\n"
 677 "    float src_xpos = SCALE_H * x_pos;\n"
 678 "    float src_ypos = SCALE_V * y_pos;\n"
 679 "    ushort4 argb = tex2D<ushort4>(srcRGB, src_xpos, src_ypos);\n"
 680 "    argb.x = do_scale_pixel(argb.x);\n"
 681 "    argb.y = do_scale_pixel(argb.y);\n"
 682 "    argb.z = do_scale_pixel(argb.z);\n"
 683 "    argb.w = do_scale_pixel(argb.w);\n"
 684      /* FIXME: RGB10A2_LE or BGR10A2_LE only */
 685 "    if (OUT_DEPTH > 8) {\n"
 686 "      unsigned int packed_rgb = 0;\n"
 687 "      unsigned int a, r, g, b;"
 688 "      a = (argb.x >> 8) & 0x3;\n"
 689 "      r = argb.y & 0x3ff;\n"
 690 "      g = argb.z & 0x3ff;\n"
 691 "      b = argb.w & 0x3ff;\n"
 692        /* A is always MSB, we support only little endian system */
 693 "      packed_rgb = a << 30;\n"
 694 "      packed_rgb |= (r << (30 - (R_IDX * 10)));\n"
 695 "      packed_rgb |= (g << (30 - (G_IDX * 10)));\n"
 696 "      packed_rgb |= (b << (30 - (B_IDX * 10)));\n"
 697 "      *(unsigned int *)&dstRGB[x_pos * 4 + y_pos * dst_stride] = packed_rgb;\n"
 698 "    } else {\n"
 699 "      if (A_IDX >= 0) {\n"
 700 "        argb.x = do_scale_pixel(argb.x);\n"
 701 "        dstRGB[x_pos * PSTRIDE + A_IDX + y_pos * dst_stride] = argb.x;\n"
 702 "      } else if (X_IDX >= 0) {\n"
 703 "        dstRGB[x_pos * PSTRIDE + X_IDX + y_pos * dst_stride] = 0xff;\n"
 704 "      }\n"
 705 "      dstRGB[x_pos * PSTRIDE + R_IDX + y_pos * dst_stride] = argb.y;\n"
 706 "      dstRGB[x_pos * PSTRIDE + G_IDX + y_pos * dst_stride] = argb.z;\n"
 707 "      dstRGB[x_pos * PSTRIDE + B_IDX + y_pos * dst_stride] = argb.w;\n"
 708 "    }\n"
 709 "  }\n"
 710 "}\n"
 711 "\n"
 712 "}";
 713 /* *INDENT-ON* */
 714
 715 typedef struct
 716 {
 717   gint R;
 718   gint G;
 719   gint B;
 720   gint A;
 721   gint X;
 722 } GstCudaRGBOrder;
 723
 724 typedef struct
 725 {
 726   CUdeviceptr device_ptr;
 727   gsize cuda_stride;
 728 } GstCudaStageBuffer;
 729
 730 #define CONVERTER_MAX_NUM_FUNC 4
 731
 732 struct _GstCudaConverter
 733 {
 734   GstVideoInfo in_info;
 735   GstVideoInfo out_info;
 736   gboolean keep_size;
 737
 738   gint texture_alignment;
 739
 740   GstCudaContext *cuda_ctx;
 741   CUmodule cuda_module;
 742   CUfunction kernel_func[CONVERTER_MAX_NUM_FUNC];
 743   const gchar *func_names[CONVERTER_MAX_NUM_FUNC];
 744   gchar *kernel_source;
 745   gchar *ptx;
 746   GstCudaStageBuffer fallback_buffer[GST_VIDEO_MAX_PLANES];
 747
 748     gboolean (*convert) (GstCudaConverter * convert, const GstCudaMemory * src,
 749       GstVideoInfo * in_info, GstCudaMemory * dst, GstVideoInfo * out_info,
 750       CUstream cuda_stream);
 751
 752   const CUdeviceptr src;
 753   GstVideoInfo *cur_in_info;
 754
 755   CUdeviceptr dest;
 756   GstVideoInfo *cur_out_info;
 757
 758   /* rgb to {rgb, yuv} only */
 759   GstCudaRGBOrder in_rgb_order;
 760   GstCudaStageBuffer unpack_surface;
 761   GstCudaStageBuffer y444_surface[GST_VIDEO_MAX_PLANES];
 762 };
 763
 764 #define LOAD_CUDA_FUNC(module,func,name) G_STMT_START { \
 765   if (!gst_cuda_result (CuModuleGetFunction (&(func), (module), name))) { \
 766     GST_ERROR ("failed to get %s function", (name)); \
 767     goto error; \
 768   } \
 769 } G_STMT_END
 770
 771 /**
 772  * gst_cuda_converter_new:
 773  * @in_info: a #GstVideoInfo
 774  * @out_info: a #GstVideoInfo
 775  * @cuda_ctx: (transfer none): a #GstCudaContext
 776  *
 777  * Create a new converter object to convert between @in_info and @out_info
 778  * with @config.
 779  *
 780  * Returns: a #GstCudaConverter or %NULL if conversion is not possible.
 781  */
 782 GstCudaConverter *
 783 gst_cuda_converter_new (GstVideoInfo * in_info, GstVideoInfo * out_info,
 784     GstCudaContext * cuda_ctx)
 785 {
 786   GstCudaConverter *convert;
 787   gint i;
 788
 789   g_return_val_if_fail (in_info != NULL, NULL);
 790   g_return_val_if_fail (out_info != NULL, NULL);
 791   g_return_val_if_fail (cuda_ctx != NULL, NULL);
 792   /* we won't ever do framerate conversion */
 793   g_return_val_if_fail (in_info->fps_n == out_info->fps_n, NULL);
 794   g_return_val_if_fail (in_info->fps_d == out_info->fps_d, NULL);
 795   /* we won't ever do deinterlace */
 796   g_return_val_if_fail (in_info->interlace_mode == out_info->interlace_mode,
 797       NULL);
 798
 799   convert = g_new0 (GstCudaConverter, 1);
 800
 801   convert->in_info = *in_info;
 802   convert->out_info = *out_info;
 803
 804   /* FIXME: should return kernel source */
 805   if (!gst_cuda_context_push (cuda_ctx)) {
 806     GST_ERROR ("cannot push context");
 807     goto error;
 808   }
 809
 810   if (!cuda_converter_lookup_path (convert))
 811     goto error;
 812
 813   convert->ptx = gst_cuda_nvrtc_compile (convert->kernel_source);
 814   if (!convert->ptx) {
 815     GST_ERROR ("no PTX data to load");
 816     goto error;
 817   }
 818
 819   GST_TRACE ("compiled convert ptx \n%s", convert->ptx);
 820
 821   if (!gst_cuda_result (CuModuleLoadData (&convert->cuda_module, convert->ptx))) {
 822     gst_cuda_context_pop (NULL);
 823     GST_ERROR ("failed to load cuda module data");
 824
 825     goto error;
 826   }
 827
 828   for (i = 0; i < CONVERTER_MAX_NUM_FUNC; i++) {
 829     if (!convert->func_names[i])
 830       break;
 831
 832     LOAD_CUDA_FUNC (convert->cuda_module, convert->kernel_func[i],
 833         convert->func_names[i]);
 834     GST_DEBUG ("kernel function \"%s\" loaded", convert->func_names[i]);
 835   }
 836
 837   gst_cuda_context_pop (NULL);
 838   convert->cuda_ctx = gst_object_ref (cuda_ctx);
 839   convert->texture_alignment =
 840       gst_cuda_context_get_texture_alignment (cuda_ctx);
 841
 842   g_free (convert->kernel_source);
 843   g_free (convert->ptx);
 844   convert->kernel_source = NULL;
 845   convert->ptx = NULL;
 846
 847   return convert;
 848
 849 error:
 850   gst_cuda_context_pop (NULL);
 851   gst_cuda_converter_free (convert);
 852
 853   return NULL;
 854 }
 855
 856 /**
 857  * gst_video_converter_free:
 858  * @convert: a #GstCudaConverter
 859  *
 860  * Free @convert
 861  */
 862 void
 863 gst_cuda_converter_free (GstCudaConverter * convert)
 864 {
 865   g_return_if_fail (convert != NULL);
 866
 867   if (convert->cuda_ctx) {
 868     if (gst_cuda_context_push (convert->cuda_ctx)) {
 869       gint i;
 870
 871       if (convert->cuda_module) {
 872         gst_cuda_result (CuModuleUnload (convert->cuda_module));
 873       }
 874
 875       for (i = 0; i < GST_VIDEO_MAX_PLANES; i++) {
 876         if (convert->fallback_buffer[i].device_ptr)
 877           gst_cuda_result (CuMemFree (convert->fallback_buffer[i].device_ptr));
 878         if (convert->y444_surface[i].device_ptr)
 879           gst_cuda_result (CuMemFree (convert->y444_surface[i].device_ptr));
 880       }
 881
 882       if (convert->unpack_surface.device_ptr)
 883         gst_cuda_result (CuMemFree (convert->unpack_surface.device_ptr));
 884
 885       gst_cuda_context_pop (NULL);
 886     }
 887
 888     gst_object_unref (convert->cuda_ctx);
 889   }
 890
 891   g_free (convert->kernel_source);
 892   g_free (convert->ptx);
 893   g_free (convert);
 894 }
 895
 896 /**
 897  * gst_cuda_converter_frame:
 898  * @convert: a #GstCudaConverter
 899  * @src: a #GstCudaMemory
 900  * @in_info: a #GstVideoInfo representing @src
 901  * @dst: a #GstCudaMemory
 902  * @out_info: a #GstVideoInfo representing @dst
 903  * @cuda_stream: a #CUstream
 904  *
 905  * Convert the pixels of @src into @dest using @convert.
 906  * Called without gst_cuda_context_push() and gst_cuda_context_pop() by caller
 907  */
 908 gboolean
 909 gst_cuda_converter_frame (GstCudaConverter * convert, const GstCudaMemory * src,
 910     GstVideoInfo * in_info, GstCudaMemory * dst, GstVideoInfo * out_info,
 911     CUstream cuda_stream)
 912 {
 913   gboolean ret;
 914
 915   g_return_val_if_fail (convert, FALSE);
 916   g_return_val_if_fail (src, FALSE);
 917   g_return_val_if_fail (in_info, FALSE);
 918   g_return_val_if_fail (dst, FALSE);
 919   g_return_val_if_fail (out_info, FALSE);
 920
 921   gst_cuda_context_push (convert->cuda_ctx);
 922
 923   ret = gst_cuda_converter_frame_unlocked (convert,
 924       src, in_info, dst, out_info, cuda_stream);
 925
 926   gst_cuda_context_pop (NULL);
 927
 928   return ret;
 929 }
 930
 931 /**
 932  * gst_cuda_converter_frame_unlocked:
 933  * @convert: a #GstCudaConverter
 934  * @src: a #GstCudaMemory
 935  * @in_info: a #GstVideoInfo representing @src
 936  * @dst: a #GstCudaMemory
 937  * @out_info: a #GstVideoInfo representing @dest
 938  * @cuda_stream: a #CUstream
 939  *
 940  * Convert the pixels of @src into @dest using @convert.
 941  * Caller should call this method after gst_cuda_context_push()
 942  */
 943 gboolean
 944 gst_cuda_converter_frame_unlocked (GstCudaConverter * convert,
 945     const GstCudaMemory * src, GstVideoInfo * in_info, GstCudaMemory * dst,
 946     GstVideoInfo * out_info, CUstream cuda_stream)
 947 {
 948   g_return_val_if_fail (convert, FALSE);
 949   g_return_val_if_fail (src, FALSE);
 950   g_return_val_if_fail (in_info, FALSE);
 951   g_return_val_if_fail (dst, FALSE);
 952   g_return_val_if_fail (out_info, FALSE);
 953
 954   return convert->convert (convert, src, in_info, dst, out_info, cuda_stream);
 955 }
 956
 957 /* allocate fallback memory for texture alignment requirement */
 958 static gboolean
 959 convert_ensure_fallback_memory (GstCudaConverter * convert,
 960     GstVideoInfo * info, guint plane)
 961 {
 962   CUresult ret;
 963   guint element_size = 8;
 964
 965   if (convert->fallback_buffer[plane].device_ptr)
 966     return TRUE;
 967
 968   if (GST_VIDEO_INFO_COMP_DEPTH (info, 0) > 8)
 969     element_size = 16;
 970
 971   ret = CuMemAllocPitch (&convert->fallback_buffer[plane].device_ptr,
 972       &convert->fallback_buffer[plane].cuda_stride,
 973       GST_VIDEO_INFO_COMP_WIDTH (info, plane) *
 974       GST_VIDEO_INFO_COMP_PSTRIDE (info, plane),
 975       GST_VIDEO_INFO_COMP_HEIGHT (info, plane), element_size);
 976
 977   if (!gst_cuda_result (ret)) {
 978     GST_ERROR ("failed to allocated fallback memory");
 979     return FALSE;
 980   }
 981
 982   return TRUE;
 983 }
 984
 985 /* create a 2D CUDA texture without alignment check */
 986 static CUtexObject
 987 convert_create_texture_unchecked (const CUdeviceptr src, gint width,
 988     gint height, gint channels, gint stride, CUarray_format format,
 989     CUfilter_mode mode, CUstream cuda_stream)
 990 {
 991   CUDA_TEXTURE_DESC texture_desc;
 992   CUDA_RESOURCE_DESC resource_desc;
 993   CUtexObject texture = 0;
 994   CUresult cuda_ret;
 995
 996   memset (&texture_desc, 0, sizeof (CUDA_TEXTURE_DESC));
 997   memset (&resource_desc, 0, sizeof (CUDA_RESOURCE_DESC));
 998
 999   resource_desc.resType = CU_RESOURCE_TYPE_PITCH2D;
1000   resource_desc.res.pitch2D.format = format;
1001   resource_desc.res.pitch2D.numChannels = channels;
1002   resource_desc.res.pitch2D.width = width;
1003   resource_desc.res.pitch2D.height = height;
1004   resource_desc.res.pitch2D.pitchInBytes = stride;
1005   resource_desc.res.pitch2D.devPtr = src;
1006
1007   texture_desc.filterMode = mode;
1008   texture_desc.flags = CU_TRSF_READ_AS_INTEGER;
1009
1010   gst_cuda_result (CuStreamSynchronize (cuda_stream));
1011   cuda_ret = CuTexObjectCreate (&texture, &resource_desc, &texture_desc, NULL);
1012
1013   if (!gst_cuda_result (cuda_ret)) {
1014     GST_ERROR ("couldn't create texture");
1015
1016     return 0;
1017   }
1018
1019   return texture;
1020 }
1021
1022 static CUtexObject
1023 convert_create_texture (GstCudaConverter * convert, const GstCudaMemory * src,
1024     GstVideoInfo * info, guint plane, CUstream cuda_stream)
1025 {
1026   CUarray_format format = CU_AD_FORMAT_UNSIGNED_INT8;
1027   guint channels = 1;
1028   CUdeviceptr src_ptr;
1029   gsize stride;
1030   CUresult cuda_ret;
1031   CUfilter_mode mode;
1032
1033   if (GST_VIDEO_INFO_COMP_DEPTH (info, plane) > 8)
1034     format = CU_AD_FORMAT_UNSIGNED_INT16;
1035
1036   /* FIXME: more graceful method ? */
1037   if (plane != 0 &&
1038       GST_VIDEO_INFO_N_PLANES (info) != GST_VIDEO_INFO_N_COMPONENTS (info)) {
1039     channels = 2;
1040   }
1041
1042   src_ptr = src->data + src->offset[plane];
1043   stride = src->stride;
1044
1045   if (convert->texture_alignment && (src_ptr % convert->texture_alignment)) {
1046     CUDA_MEMCPY2D copy_params = { 0, };
1047
1048     if (!convert_ensure_fallback_memory (convert, info, plane))
1049       return 0;
1050
1051     GST_LOG ("device memory was not aligned, copy to fallback memory");
1052
1053     copy_params.srcMemoryType = CU_MEMORYTYPE_DEVICE;
1054     copy_params.srcPitch = stride;
1055     copy_params.srcDevice = (CUdeviceptr) src_ptr;
1056
1057     copy_params.dstMemoryType = CU_MEMORYTYPE_DEVICE;
1058     copy_params.dstPitch = convert->fallback_buffer[plane].cuda_stride;
1059     copy_params.dstDevice = convert->fallback_buffer[plane].device_ptr;
1060     copy_params.WidthInBytes = GST_VIDEO_INFO_COMP_WIDTH (info, plane)
1061         * GST_VIDEO_INFO_COMP_PSTRIDE (info, plane);
1062     copy_params.Height = GST_VIDEO_INFO_COMP_HEIGHT (info, plane);
1063
1064     cuda_ret = CuMemcpy2DAsync (&copy_params, cuda_stream);
1065     if (!gst_cuda_result (cuda_ret)) {
1066       GST_ERROR ("failed to copy to fallback buffer");
1067       return 0;
1068     }
1069
1070     src_ptr = convert->fallback_buffer[plane].device_ptr;
1071     stride = convert->fallback_buffer[plane].cuda_stride;
1072   }
1073
1074   /* Use h/w linear interpolation only when resize is required.
1075    * Otherwise the image might be blurred */
1076   if (convert->keep_size)
1077     mode = CU_TR_FILTER_MODE_POINT;
1078   else
1079     mode = CU_TR_FILTER_MODE_LINEAR;
1080
1081   return convert_create_texture_unchecked (src_ptr,
1082       GST_VIDEO_INFO_COMP_WIDTH (info, plane),
1083       GST_VIDEO_INFO_COMP_HEIGHT (info, plane), channels, stride, format, mode,
1084       cuda_stream);
1085 }
1086
1087 /* main conversion function for YUV to YUV conversion */
1088 static gboolean
1089 convert_YUV_TO_YUV (GstCudaConverter * convert,
1090     const GstCudaMemory * src, GstVideoInfo * in_info, GstCudaMemory * dst,
1091     GstVideoInfo * out_info, CUstream cuda_stream)
1092 {
1093   CUtexObject texture[GST_VIDEO_MAX_PLANES] = { 0, };
1094   CUresult cuda_ret;
1095   gboolean ret = FALSE;
1096   CUdeviceptr dst_ptr[GST_VIDEO_MAX_PLANES] = { 0, };
1097   gint dst_stride;
1098   gint width, height;
1099   gint i;
1100
1101   gpointer kernel_args[] = { &texture[0], &texture[1], &texture[2],
1102     &dst_ptr[0], &dst_ptr[1], &dst_ptr[2], &dst_stride
1103   };
1104
1105   /* conversion step
1106    * STEP 1: create CUtexObject per plane
1107    * STEP 2: call YUV to YUV conversion kernel function.
1108    *         resize, uv reordering and bitdepth conversion will be performed in
1109    *         the CUDA kernel function
1110    */
1111
1112   /* map CUDA device memory to CUDA texture object */
1113   for (i = 0; i < GST_VIDEO_INFO_N_PLANES (in_info); i++) {
1114     texture[i] = convert_create_texture (convert, src, in_info, i, cuda_stream);
1115     if (!texture[i]) {
1116       GST_ERROR ("couldn't create texture for %d th plane", i);
1117       goto done;
1118     }
1119   }
1120
1121   for (i = 0; i < GST_VIDEO_INFO_N_PLANES (out_info); i++)
1122     dst_ptr[i] = dst->data + dst->offset[i];
1123
1124   dst_stride = dst->stride;
1125
1126   width = GST_VIDEO_INFO_WIDTH (out_info);
1127   height = GST_VIDEO_INFO_HEIGHT (out_info);
1128
1129   cuda_ret =
1130       CuLaunchKernel (convert->kernel_func[0], DIV_UP (width, CUDA_BLOCK_X),
1131       DIV_UP (height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0,
1132       cuda_stream, kernel_args, NULL);
1133
1134   if (!gst_cuda_result (cuda_ret)) {
1135     GST_ERROR ("could not rescale plane");
1136     goto done;
1137   }
1138
1139   ret = TRUE;
1140   gst_cuda_result (CuStreamSynchronize (cuda_stream));
1141
1142 done:
1143   for (i = 0; i < GST_VIDEO_INFO_N_PLANES (in_info); i++) {
1144     if (texture[i])
1145       gst_cuda_result (CuTexObjectDestroy (texture[i]));
1146   }
1147
1148   return ret;
1149 }
1150
1151 /* main conversion function for YUV to RGB conversion */
1152 static gboolean
1153 convert_YUV_TO_RGB (GstCudaConverter * convert,
1154     const GstCudaMemory * src, GstVideoInfo * in_info, GstCudaMemory * dst,
1155     GstVideoInfo * out_info, CUstream cuda_stream)
1156 {
1157   CUtexObject texture[GST_VIDEO_MAX_PLANES] = { 0, };
1158   CUresult cuda_ret;
1159   gboolean ret = FALSE;
1160   CUdeviceptr dstRGB = 0;
1161   gint dst_stride;
1162   gint width, height;
1163   gint i;
1164
1165   gpointer kernel_args[] = { &texture[0], &texture[1], &texture[2],
1166     &dstRGB, &dst_stride
1167   };
1168
1169   /* conversion step
1170    * STEP 1: create CUtexObject per plane
1171    * STEP 2: call YUV to RGB conversion kernel function.
1172    *         resizing, argb ordering and bitdepth conversion will be performed in
1173    *         the CUDA kernel function
1174    */
1175
1176   /* map CUDA device memory to CUDA texture object */
1177   for (i = 0; i < GST_VIDEO_INFO_N_PLANES (in_info); i++) {
1178     texture[i] = convert_create_texture (convert, src, in_info, i, cuda_stream);
1179     if (!texture[i]) {
1180       GST_ERROR ("couldn't create texture for %d th plane", i);
1181       goto done;
1182     }
1183   }
1184
1185   dstRGB = dst->data;
1186   dst_stride = dst->stride;
1187
1188   width = GST_VIDEO_INFO_WIDTH (out_info);
1189   height = GST_VIDEO_INFO_HEIGHT (out_info);
1190
1191   cuda_ret =
1192       CuLaunchKernel (convert->kernel_func[0], DIV_UP (width, CUDA_BLOCK_X),
1193       DIV_UP (height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0,
1194       cuda_stream, kernel_args, NULL);
1195
1196   if (!gst_cuda_result (cuda_ret)) {
1197     GST_ERROR ("could not rescale plane");
1198     goto done;
1199   }
1200
1201   ret = TRUE;
1202   gst_cuda_result (CuStreamSynchronize (cuda_stream));
1203
1204 done:
1205   for (i = 0; i < GST_VIDEO_INFO_N_PLANES (in_info); i++) {
1206     if (texture[i])
1207       gst_cuda_result (CuTexObjectDestroy (texture[i]));
1208   }
1209
1210   return ret;
1211 }
1212
1213 static gboolean
1214 convert_UNPACK_RGB (GstCudaConverter * convert, CUfunction kernel_func,
1215     CUstream cuda_stream, const GstCudaMemory * src, GstVideoInfo * in_info,
1216     CUdeviceptr dst, gint dst_stride, GstCudaRGBOrder * rgb_order)
1217 {
1218   CUdeviceptr srcRGB = 0;
1219   gint width, height;
1220   gint src_stride, src_pstride;
1221   CUresult cuda_ret;
1222
1223   gpointer unpack_kernel_args[] = { &srcRGB, &dst,
1224     &width, &height,
1225     &src_stride, &src_pstride, &dst_stride,
1226     &convert->in_rgb_order.R, &convert->in_rgb_order.G,
1227     &convert->in_rgb_order.B, &convert->in_rgb_order.A,
1228   };
1229
1230   srcRGB = src->data;
1231   src_stride = src->stride;
1232
1233   width = GST_VIDEO_INFO_WIDTH (in_info);
1234   height = GST_VIDEO_INFO_HEIGHT (in_info);
1235   src_pstride = GST_VIDEO_INFO_COMP_PSTRIDE (in_info, 0);
1236
1237   cuda_ret =
1238       CuLaunchKernel (kernel_func, DIV_UP (width, CUDA_BLOCK_X),
1239       DIV_UP (height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0,
1240       cuda_stream, unpack_kernel_args, NULL);
1241
1242   if (!gst_cuda_result (cuda_ret)) {
1243     GST_ERROR ("could not unpack rgb");
1244     return FALSE;
1245   }
1246
1247   return TRUE;
1248 }
1249
1250 static gboolean
1251 convert_TO_Y444 (GstCudaConverter * convert, CUfunction kernel_func,
1252     CUstream cuda_stream, CUtexObject srcRGB, CUdeviceptr dstY, gint y_stride,
1253     CUdeviceptr dstU, gint u_stride, CUdeviceptr dstV, gint v_stride,
1254     gint width, gint height, gint pstride, gint bitdepth)
1255 {
1256   CUresult cuda_ret;
1257
1258   gpointer kernel_args[] = { &srcRGB, &dstY, &y_stride, &dstU, &u_stride, &dstV,
1259     &v_stride, &width, &height, &pstride, &bitdepth,
1260   };
1261
1262   cuda_ret =
1263       CuLaunchKernel (kernel_func, DIV_UP (width, CUDA_BLOCK_X),
1264       DIV_UP (height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0,
1265       cuda_stream, kernel_args, NULL);
1266
1267   if (!gst_cuda_result (cuda_ret)) {
1268     GST_ERROR ("could not unpack rgb");
1269     return FALSE;
1270   }
1271
1272   return TRUE;
1273 }
1274
1275 /* main conversion function for RGB to YUV conversion */
1276 static gboolean
1277 convert_RGB_TO_YUV (GstCudaConverter * convert,
1278     const GstCudaMemory * src, GstVideoInfo * in_info, GstCudaMemory * dst,
1279     GstVideoInfo * out_info, CUstream cuda_stream)
1280 {
1281   CUtexObject texture = 0;
1282   CUtexObject yuv_texture[3] = { 0, };
1283   CUdeviceptr dst_ptr[GST_VIDEO_MAX_PLANES] = { 0, };
1284   CUresult cuda_ret;
1285   gboolean ret = FALSE;
1286   gint in_width, in_height;
1287   gint out_width, out_height;
1288   gint dst_stride;
1289   CUarray_format format = CU_AD_FORMAT_UNSIGNED_INT8;
1290   CUfilter_mode mode = CU_TR_FILTER_MODE_POINT;
1291   gint pstride = 1;
1292   gint bitdepth = 8;
1293   gint i;
1294
1295   gpointer kernel_args[] = { &yuv_texture[0], &yuv_texture[1], &yuv_texture[2],
1296     &dst_ptr[0], &dst_ptr[1], &dst_ptr[2], &dst_stride
1297   };
1298
1299   /* conversion step
1300    * STEP 1: unpack src RGB into ARGB or ARGB64 format
1301    * STEP 2: convert unpacked ARGB (or ARGB64) to Y444 (or Y444_16LE)
1302    * STEP 3: convert Y444 (or Y444_16LE) to final YUV format.
1303    *         resizing, bitdepth conversion, uv reordering will be performed in
1304    *         the CUDA kernel function
1305    */
1306   if (!convert_UNPACK_RGB (convert, convert->kernel_func[0], cuda_stream,
1307           src, in_info, convert->unpack_surface.device_ptr,
1308           convert->unpack_surface.cuda_stride, &convert->in_rgb_order)) {
1309     GST_ERROR ("could not unpack input rgb");
1310
1311     goto done;
1312   }
1313
1314   in_width = GST_VIDEO_INFO_WIDTH (in_info);
1315   in_height = GST_VIDEO_INFO_HEIGHT (in_info);
1316
1317   out_width = GST_VIDEO_INFO_WIDTH (out_info);
1318   out_height = GST_VIDEO_INFO_HEIGHT (out_info);
1319   dst_stride = dst->stride;
1320
1321   if (GST_VIDEO_INFO_COMP_DEPTH (in_info, 0) > 8) {
1322     pstride = 2;
1323     bitdepth = 16;
1324     format = CU_AD_FORMAT_UNSIGNED_INT16;
1325   }
1326
1327   texture =
1328       convert_create_texture_unchecked (convert->unpack_surface.device_ptr,
1329       in_width, in_height, 4, convert->unpack_surface.cuda_stride, format,
1330       mode, cuda_stream);
1331
1332   if (!texture) {
1333     GST_ERROR ("could not create texture");
1334     goto done;
1335   }
1336
1337   if (!convert_TO_Y444 (convert, convert->kernel_func[1], cuda_stream, texture,
1338           convert->y444_surface[0].device_ptr,
1339           convert->y444_surface[0].cuda_stride,
1340           convert->y444_surface[1].device_ptr,
1341           convert->y444_surface[1].cuda_stride,
1342           convert->y444_surface[2].device_ptr,
1343           convert->y444_surface[2].cuda_stride, in_width, in_height, pstride,
1344           bitdepth)) {
1345     GST_ERROR ("could not convert to Y444 or Y444_16LE");
1346     goto done;
1347   }
1348
1349   /* Use h/w linear interpolation only when resize is required.
1350    * Otherwise the image might be blurred */
1351   if (convert->keep_size)
1352     mode = CU_TR_FILTER_MODE_POINT;
1353   else
1354     mode = CU_TR_FILTER_MODE_LINEAR;
1355
1356   for (i = 0; i < 3; i++) {
1357     yuv_texture[i] =
1358         convert_create_texture_unchecked (convert->y444_surface[i].device_ptr,
1359         in_width, in_height, 1, convert->y444_surface[i].cuda_stride, format,
1360         mode, cuda_stream);
1361
1362     if (!yuv_texture[i]) {
1363       GST_ERROR ("could not create %dth yuv texture", i);
1364       goto done;
1365     }
1366   }
1367
1368   for (i = 0; i < GST_VIDEO_INFO_N_PLANES (out_info); i++)
1369     dst_ptr[i] = dst->data + dst->offset[i];
1370
1371   cuda_ret =
1372       CuLaunchKernel (convert->kernel_func[2], DIV_UP (out_width, CUDA_BLOCK_X),
1373       DIV_UP (out_height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0,
1374       cuda_stream, kernel_args, NULL);
1375
1376   if (!gst_cuda_result (cuda_ret)) {
1377     GST_ERROR ("could not rescale plane");
1378     goto done;
1379   }
1380
1381   ret = TRUE;
1382   gst_cuda_result (CuStreamSynchronize (cuda_stream));
1383
1384 done:
1385   if (texture)
1386     gst_cuda_result (CuTexObjectDestroy (texture));
1387   for (i = 0; i < 3; i++) {
1388     if (yuv_texture[i])
1389       gst_cuda_result (CuTexObjectDestroy (yuv_texture[i]));
1390   }
1391
1392   return ret;
1393 }
1394
1395 /* main conversion function for RGB to RGB conversion */
1396 static gboolean
1397 convert_RGB_TO_RGB (GstCudaConverter * convert,
1398     const GstCudaMemory * src, GstVideoInfo * in_info, GstCudaMemory * dst,
1399     GstVideoInfo * out_info, CUstream cuda_stream)
1400 {
1401   CUtexObject texture = 0;
1402   CUresult cuda_ret;
1403   gboolean ret = FALSE;
1404   CUdeviceptr dstRGB = 0;
1405   gint in_width, in_height;
1406   gint out_width, out_height;
1407   gint dst_stride;
1408   CUfilter_mode mode;
1409   CUarray_format format = CU_AD_FORMAT_UNSIGNED_INT8;
1410
1411   gpointer rescale_kernel_args[] = { &texture, &dstRGB, &dst_stride };
1412
1413   /* conversion step
1414    * STEP 1: unpack src RGB into ARGB or ARGB64 format
1415    * STEP 2: convert ARGB (or ARGB64) to final RGB format.
1416    *         resizing, bitdepth conversion, argb reordering will be performed in
1417    *         the CUDA kernel function
1418    */
1419
1420   if (!convert_UNPACK_RGB (convert, convert->kernel_func[0], cuda_stream,
1421           src, in_info, convert->unpack_surface.device_ptr,
1422           convert->unpack_surface.cuda_stride, &convert->in_rgb_order)) {
1423     GST_ERROR ("could not unpack input rgb");
1424
1425     goto done;
1426   }
1427
1428   in_width = GST_VIDEO_INFO_WIDTH (in_info);
1429   in_height = GST_VIDEO_INFO_HEIGHT (in_info);
1430
1431   out_width = GST_VIDEO_INFO_WIDTH (out_info);
1432   out_height = GST_VIDEO_INFO_HEIGHT (out_info);
1433
1434   dstRGB = dst->data;
1435   dst_stride = dst->stride;
1436
1437   if (GST_VIDEO_INFO_COMP_DEPTH (in_info, 0) > 8)
1438     format = CU_AD_FORMAT_UNSIGNED_INT16;
1439
1440   /* Use h/w linear interpolation only when resize is required.
1441    * Otherwise the image might be blurred */
1442   if (convert->keep_size)
1443     mode = CU_TR_FILTER_MODE_POINT;
1444   else
1445     mode = CU_TR_FILTER_MODE_LINEAR;
1446
1447   texture =
1448       convert_create_texture_unchecked (convert->unpack_surface.device_ptr,
1449       in_width, in_height, 4, convert->unpack_surface.cuda_stride, format,
1450       mode, cuda_stream);
1451
1452   if (!texture) {
1453     GST_ERROR ("could not create texture");
1454     goto done;
1455   }
1456
1457   cuda_ret =
1458       CuLaunchKernel (convert->kernel_func[1], DIV_UP (out_width, CUDA_BLOCK_X),
1459       DIV_UP (out_height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0,
1460       cuda_stream, rescale_kernel_args, NULL);
1461
1462   if (!gst_cuda_result (cuda_ret)) {
1463     GST_ERROR ("could not rescale plane");
1464     goto done;
1465   }
1466
1467   ret = TRUE;
1468   gst_cuda_result (CuStreamSynchronize (cuda_stream));
1469
1470 done:
1471   if (texture)
1472     gst_cuda_result (CuTexObjectDestroy (texture));
1473
1474   return ret;
1475 }
1476
1477 /* from video-converter.c */
1478 typedef struct
1479 {
1480   gdouble dm[4][4];
1481 } MatrixData;
1482
1483 static void
1484 color_matrix_set_identity (MatrixData * m)
1485 {
1486   gint i, j;
1487
1488   for (i = 0; i < 4; i++) {
1489     for (j = 0; j < 4; j++) {
1490       m->dm[i][j] = (i == j);
1491     }
1492   }
1493 }
1494
1495 static void
1496 color_matrix_copy (MatrixData * d, const MatrixData * s)
1497 {
1498   gint i, j;
1499
1500   for (i = 0; i < 4; i++)
1501     for (j = 0; j < 4; j++)
1502       d->dm[i][j] = s->dm[i][j];
1503 }
1504
1505 /* Perform 4x4 matrix multiplication:
1506  *  - @dst@ = @a@ * @b@
1507  *  - @dst@ may be a pointer to @a@ andor @b@
1508  */
1509 static void
1510 color_matrix_multiply (MatrixData * dst, MatrixData * a, MatrixData * b)
1511 {
1512   MatrixData tmp;
1513   gint i, j, k;
1514
1515   for (i = 0; i < 4; i++) {
1516     for (j = 0; j < 4; j++) {
1517       gdouble x = 0;
1518       for (k = 0; k < 4; k++) {
1519         x += a->dm[i][k] * b->dm[k][j];
1520       }
1521       tmp.dm[i][j] = x;
1522     }
1523   }
1524   color_matrix_copy (dst, &tmp);
1525 }
1526
1527 static void
1528 color_matrix_offset_components (MatrixData * m, gdouble a1, gdouble a2,
1529     gdouble a3)
1530 {
1531   MatrixData a;
1532
1533   color_matrix_set_identity (&a);
1534   a.dm[0][3] = a1;
1535   a.dm[1][3] = a2;
1536   a.dm[2][3] = a3;
1537   color_matrix_multiply (m, &a, m);
1538 }
1539
1540 static void
1541 color_matrix_scale_components (MatrixData * m, gdouble a1, gdouble a2,
1542     gdouble a3)
1543 {
1544   MatrixData a;
1545
1546   color_matrix_set_identity (&a);
1547   a.dm[0][0] = a1;
1548   a.dm[1][1] = a2;
1549   a.dm[2][2] = a3;
1550   color_matrix_multiply (m, &a, m);
1551 }
1552
1553 static void
1554 color_matrix_debug (const MatrixData * s)
1555 {
1556   GST_DEBUG ("[%f %f %f %f]", s->dm[0][0], s->dm[0][1], s->dm[0][2],
1557       s->dm[0][3]);
1558   GST_DEBUG ("[%f %f %f %f]", s->dm[1][0], s->dm[1][1], s->dm[1][2],
1559       s->dm[1][3]);
1560   GST_DEBUG ("[%f %f %f %f]", s->dm[2][0], s->dm[2][1], s->dm[2][2],
1561       s->dm[2][3]);
1562   GST_DEBUG ("[%f %f %f %f]", s->dm[3][0], s->dm[3][1], s->dm[3][2],
1563       s->dm[3][3]);
1564 }
1565
1566 static void
1567 color_matrix_YCbCr_to_RGB (MatrixData * m, gdouble Kr, gdouble Kb)
1568 {
1569   gdouble Kg = 1.0 - Kr - Kb;
1570   MatrixData k = {
1571     {
1572           {1., 0., 2 * (1 - Kr), 0.},
1573           {1., -2 * Kb * (1 - Kb) / Kg, -2 * Kr * (1 - Kr) / Kg, 0.},
1574           {1., 2 * (1 - Kb), 0., 0.},
1575           {0., 0., 0., 1.},
1576         }
1577   };
1578
1579   color_matrix_multiply (m, &k, m);
1580 }
1581
1582 static void
1583 color_matrix_RGB_to_YCbCr (MatrixData * m, gdouble Kr, gdouble Kb)
1584 {
1585   gdouble Kg = 1.0 - Kr - Kb;
1586   MatrixData k;
1587   gdouble x;
1588
1589   k.dm[0][0] = Kr;
1590   k.dm[0][1] = Kg;
1591   k.dm[0][2] = Kb;
1592   k.dm[0][3] = 0;
1593
1594   x = 1 / (2 * (1 - Kb));
1595   k.dm[1][0] = -x * Kr;
1596   k.dm[1][1] = -x * Kg;
1597   k.dm[1][2] = x * (1 - Kb);
1598   k.dm[1][3] = 0;
1599
1600   x = 1 / (2 * (1 - Kr));
1601   k.dm[2][0] = x * (1 - Kr);
1602   k.dm[2][1] = -x * Kg;
1603   k.dm[2][2] = -x * Kb;
1604   k.dm[2][3] = 0;
1605
1606   k.dm[3][0] = 0;
1607   k.dm[3][1] = 0;
1608   k.dm[3][2] = 0;
1609   k.dm[3][3] = 1;
1610
1611   color_matrix_multiply (m, &k, m);
1612 }
1613
1614 static void
1615 compute_matrix_to_RGB (GstCudaConverter * convert, MatrixData * data,
1616     GstVideoInfo * info)
1617 {
1618   gdouble Kr = 0, Kb = 0;
1619   gint offset[4], scale[4];
1620
1621   /* bring color components to [0..1.0] range */
1622   gst_video_color_range_offsets (info->colorimetry.range, info->finfo, offset,
1623       scale);
1624
1625   color_matrix_offset_components (data, -offset[0], -offset[1], -offset[2]);
1626   color_matrix_scale_components (data, 1 / ((float) scale[0]),
1627       1 / ((float) scale[1]), 1 / ((float) scale[2]));
1628
1629   if (!GST_VIDEO_INFO_IS_RGB (info)) {
1630     /* bring components to R'G'B' space */
1631     if (gst_video_color_matrix_get_Kr_Kb (info->colorimetry.matrix, &Kr, &Kb))
1632       color_matrix_YCbCr_to_RGB (data, Kr, Kb);
1633   }
1634   color_matrix_debug (data);
1635 }
1636
1637 static void
1638 compute_matrix_to_YUV (GstCudaConverter * convert, MatrixData * data,
1639     GstVideoInfo * info)
1640 {
1641   gdouble Kr = 0, Kb = 0;
1642   gint offset[4], scale[4];
1643
1644   if (!GST_VIDEO_INFO_IS_RGB (info)) {
1645     /* bring components to YCbCr space */
1646     if (gst_video_color_matrix_get_Kr_Kb (info->colorimetry.matrix, &Kr, &Kb))
1647       color_matrix_RGB_to_YCbCr (data, Kr, Kb);
1648   }
1649
1650   /* bring color components to nominal range */
1651   gst_video_color_range_offsets (info->colorimetry.range, info->finfo, offset,
1652       scale);
1653
1654   color_matrix_scale_components (data, (float) scale[0], (float) scale[1],
1655       (float) scale[2]);
1656   color_matrix_offset_components (data, offset[0], offset[1], offset[2]);
1657
1658   color_matrix_debug (data);
1659 }
1660
1661 static gboolean
1662 cuda_converter_get_matrix (GstCudaConverter * convert, MatrixData * matrix,
1663     GstVideoInfo * in_info, GstVideoInfo * out_info)
1664 {
1665   gboolean same_matrix, same_bits;
1666   guint in_bits, out_bits;
1667
1668   in_bits = GST_VIDEO_INFO_COMP_DEPTH (in_info, 0);
1669   out_bits = GST_VIDEO_INFO_COMP_DEPTH (out_info, 0);
1670
1671   same_bits = in_bits == out_bits;
1672   same_matrix = in_info->colorimetry.matrix == out_info->colorimetry.matrix;
1673
1674   GST_DEBUG ("matrix %d -> %d (%d)", in_info->colorimetry.matrix,
1675       out_info->colorimetry.matrix, same_matrix);
1676   GST_DEBUG ("bits %d -> %d (%d)", in_bits, out_bits, same_bits);
1677
1678   color_matrix_set_identity (matrix);
1679
1680   if (same_bits && same_matrix) {
1681     GST_DEBUG ("conversion matrix is not required");
1682
1683     return FALSE;
1684   }
1685
1686   if (in_bits < out_bits) {
1687     gint scale = 1 << (out_bits - in_bits);
1688     color_matrix_scale_components (matrix,
1689         1 / (float) scale, 1 / (float) scale, 1 / (float) scale);
1690   }
1691
1692   GST_DEBUG ("to RGB matrix");
1693   compute_matrix_to_RGB (convert, matrix, in_info);
1694   GST_DEBUG ("current matrix");
1695   color_matrix_debug (matrix);
1696
1697   GST_DEBUG ("to YUV matrix");
1698   compute_matrix_to_YUV (convert, matrix, out_info);
1699   GST_DEBUG ("current matrix");
1700   color_matrix_debug (matrix);
1701
1702   if (in_bits > out_bits) {
1703     gint scale = 1 << (in_bits - out_bits);
1704     color_matrix_scale_components (matrix,
1705         (float) scale, (float) scale, (float) scale);
1706   }
1707
1708   GST_DEBUG ("final matrix");
1709   color_matrix_debug (matrix);
1710
1711   return TRUE;
1712 }
1713
1714 static gboolean
1715 is_uv_swapped (GstVideoFormat format)
1716 {
1717   static GstVideoFormat swapped_formats[] = {
1718     GST_VIDEO_FORMAT_YV12,
1719     GST_VIDEO_FORMAT_NV21,
1720   };
1721   gint i;
1722
1723   for (i = 0; i < G_N_ELEMENTS (swapped_formats); i++) {
1724     if (format == swapped_formats[i])
1725       return TRUE;
1726   }
1727
1728   return FALSE;
1729 }
1730
1731 typedef struct
1732 {
1733   const gchar *read_chroma;
1734   const gchar *write_chroma;
1735   const gchar *unpack_function;
1736   gfloat scale_h, scale_v;
1737   gfloat chroma_scale_h, chroma_scale_v;
1738   gint width, height;
1739   gint chroma_width, chroma_height;
1740   gint in_depth;
1741   gint out_depth;
1742   gint pstride, chroma_pstride;
1743   gint in_shift, out_shift;
1744   gint mask;
1745   gint swap_uv;
1746   /* RGBA specific variables */
1747   gint max_in_val;
1748   GstCudaRGBOrder rgb_order;
1749 } GstCudaKernelTempl;
1750
1751 static gchar *
1752 cuda_converter_generate_yuv_to_yuv_kernel_code (GstCudaConverter * convert,
1753     GstCudaKernelTempl * templ)
1754 {
1755   return g_strdup_printf (templ_YUV_TO_YUV,
1756       templ->scale_h, templ->scale_v, templ->chroma_scale_h,
1757       templ->chroma_scale_v, templ->width, templ->height, templ->chroma_width,
1758       templ->chroma_height, templ->in_depth, templ->out_depth, templ->pstride,
1759       templ->chroma_pstride, templ->in_shift, templ->out_shift, templ->mask,
1760       templ->swap_uv, templ->read_chroma, templ->write_chroma);
1761 }
1762
1763 static gchar *
1764 cuda_converter_generate_yuv_to_rgb_kernel_code (GstCudaConverter * convert,
1765     GstCudaKernelTempl * templ, MatrixData * matrix)
1766 {
1767   return g_strdup_printf (templ_YUV_TO_RGB,
1768       matrix->dm[0][3], matrix->dm[1][3], matrix->dm[2][3],
1769       matrix->dm[0][0], matrix->dm[0][1], matrix->dm[0][2],
1770       matrix->dm[1][0], matrix->dm[1][1], matrix->dm[1][2],
1771       matrix->dm[2][0], matrix->dm[2][1], matrix->dm[2][2],
1772       templ->scale_h, templ->scale_v, templ->chroma_scale_h,
1773       templ->chroma_scale_v, templ->width, templ->height, templ->chroma_width,
1774       templ->chroma_height, templ->in_depth, templ->out_depth, templ->pstride,
1775       templ->chroma_pstride, templ->in_shift, templ->out_shift, templ->mask,
1776       templ->swap_uv, templ->max_in_val, templ->rgb_order.R,
1777       templ->rgb_order.G, templ->rgb_order.B, templ->rgb_order.A,
1778       templ->rgb_order.X, templ->read_chroma);
1779 }
1780
1781 static gchar *
1782 cuda_converter_generate_rgb_to_yuv_kernel_code (GstCudaConverter * convert,
1783     GstCudaKernelTempl * templ, MatrixData * matrix)
1784 {
1785   return g_strdup_printf (templ_RGB_TO_YUV,
1786       matrix->dm[0][3], matrix->dm[1][3], matrix->dm[2][3],
1787       matrix->dm[0][0], matrix->dm[0][1], matrix->dm[0][2],
1788       matrix->dm[1][0], matrix->dm[1][1], matrix->dm[1][2],
1789       matrix->dm[2][0], matrix->dm[2][1], matrix->dm[2][2],
1790       templ->scale_h, templ->scale_v, templ->chroma_scale_h,
1791       templ->chroma_scale_v, templ->width, templ->height, templ->chroma_width,
1792       templ->chroma_height, templ->in_depth, templ->out_depth, templ->pstride,
1793       templ->chroma_pstride, templ->in_shift, templ->out_shift, templ->mask,
1794       templ->swap_uv, templ->unpack_function, templ->read_chroma,
1795       templ->write_chroma);
1796 }
1797
1798 static gchar *
1799 cuda_converter_generate_rgb_to_rgb_kernel_code (GstCudaConverter * convert,
1800     GstCudaKernelTempl * templ)
1801 {
1802   return g_strdup_printf (templ_RGB_to_RGB,
1803       templ->scale_h, templ->scale_v,
1804       templ->width, templ->height,
1805       templ->in_depth, templ->out_depth, templ->pstride,
1806       templ->rgb_order.R, templ->rgb_order.G,
1807       templ->rgb_order.B, templ->rgb_order.A, templ->rgb_order.X,
1808       templ->unpack_function);
1809 }
1810
1811 #define SET_ORDER(o,r,g,b,a,x) G_STMT_START { \
1812   (o)->R = (r); \
1813   (o)->G = (g); \
1814   (o)->B = (b); \
1815   (o)->A = (a); \
1816   (o)->X = (x); \
1817 } G_STMT_END
1818
1819 static void
1820 cuda_converter_get_rgb_order (GstVideoFormat format, GstCudaRGBOrder * order)
1821 {
1822   switch (format) {
1823     case GST_VIDEO_FORMAT_RGBA:
1824       SET_ORDER (order, 0, 1, 2, 3, -1);
1825       break;
1826     case GST_VIDEO_FORMAT_RGBx:
1827       SET_ORDER (order, 0, 1, 2, -1, 3);
1828       break;
1829     case GST_VIDEO_FORMAT_BGRA:
1830       SET_ORDER (order, 2, 1, 0, 3, -1);
1831       break;
1832     case GST_VIDEO_FORMAT_BGRx:
1833       SET_ORDER (order, 2, 1, 0, -1, 3);
1834       break;
1835     case GST_VIDEO_FORMAT_ARGB:
1836       SET_ORDER (order, 1, 2, 3, 0, -1);
1837       break;
1838     case GST_VIDEO_FORMAT_ABGR:
1839       SET_ORDER (order, 3, 2, 1, 0, -1);
1840       break;
1841     case GST_VIDEO_FORMAT_RGB:
1842       SET_ORDER (order, 0, 1, 2, -1, -1);
1843       break;
1844     case GST_VIDEO_FORMAT_BGR:
1845       SET_ORDER (order, 2, 1, 0, -1, -1);
1846       break;
1847     case GST_VIDEO_FORMAT_BGR10A2_LE:
1848       SET_ORDER (order, 1, 2, 3, 0, -1);
1849       break;
1850     case GST_VIDEO_FORMAT_RGB10A2_LE:
1851       SET_ORDER (order, 3, 2, 1, 0, -1);
1852       break;
1853     default:
1854       g_assert_not_reached ();
1855       break;
1856   }
1857 }
1858
1859 static gboolean
1860 cuda_converter_lookup_path (GstCudaConverter * convert)
1861 {
1862   GstVideoFormat in_format, out_format;
1863   gboolean src_yuv, dst_yuv;
1864   gboolean src_planar, dst_planar;
1865   GstCudaKernelTempl templ = { 0, };
1866   GstVideoInfo *in_info, *out_info;
1867   gboolean ret = FALSE;
1868   CUresult cuda_ret;
1869
1870   in_info = &convert->in_info;
1871   out_info = &convert->out_info;
1872
1873   in_format = GST_VIDEO_INFO_FORMAT (in_info);
1874   out_format = GST_VIDEO_INFO_FORMAT (out_info);
1875
1876   src_yuv = GST_VIDEO_INFO_IS_YUV (in_info);
1877   dst_yuv = GST_VIDEO_INFO_IS_YUV (out_info);
1878
1879   src_planar = GST_VIDEO_INFO_N_PLANES (in_info) ==
1880       GST_VIDEO_INFO_N_COMPONENTS (in_info);
1881   dst_planar = GST_VIDEO_INFO_N_PLANES (out_info) ==
1882       GST_VIDEO_INFO_N_COMPONENTS (out_info);
1883
1884   convert->keep_size = (GST_VIDEO_INFO_WIDTH (&convert->in_info) ==
1885       GST_VIDEO_INFO_WIDTH (&convert->out_info) &&
1886       GST_VIDEO_INFO_HEIGHT (&convert->in_info) ==
1887       GST_VIDEO_INFO_HEIGHT (&convert->out_info));
1888
1889   templ.scale_h = (gfloat) GST_VIDEO_INFO_COMP_WIDTH (in_info, 0) /
1890       (gfloat) GST_VIDEO_INFO_COMP_WIDTH (out_info, 0);
1891   templ.scale_v = (gfloat) GST_VIDEO_INFO_COMP_HEIGHT (in_info, 0) /
1892       (gfloat) GST_VIDEO_INFO_COMP_HEIGHT (out_info, 0);
1893   templ.chroma_scale_h = (gfloat) GST_VIDEO_INFO_COMP_WIDTH (in_info, 1) /
1894       (gfloat) GST_VIDEO_INFO_COMP_WIDTH (out_info, 1);
1895   templ.chroma_scale_v = (gfloat) GST_VIDEO_INFO_COMP_HEIGHT (in_info, 1) /
1896       (gfloat) GST_VIDEO_INFO_COMP_HEIGHT (out_info, 1);
1897   templ.width = GST_VIDEO_INFO_COMP_WIDTH (out_info, 0);
1898   templ.height = GST_VIDEO_INFO_COMP_HEIGHT (out_info, 0);
1899   templ.chroma_width = GST_VIDEO_INFO_COMP_WIDTH (out_info, 1);
1900   templ.chroma_height = GST_VIDEO_INFO_COMP_HEIGHT (out_info, 1);
1901
1902   templ.in_depth = GST_VIDEO_INFO_COMP_DEPTH (in_info, 0);
1903   templ.out_depth = GST_VIDEO_INFO_COMP_DEPTH (out_info, 0);
1904   templ.pstride = GST_VIDEO_INFO_COMP_PSTRIDE (out_info, 0);
1905   templ.chroma_pstride = GST_VIDEO_INFO_COMP_PSTRIDE (out_info, 1);
1906   templ.in_shift = in_info->finfo->shift[0];
1907   templ.out_shift = out_info->finfo->shift[0];
1908   templ.mask = ((1 << templ.out_depth) - 1) << templ.out_shift;
1909   templ.swap_uv = (is_uv_swapped (in_format) != is_uv_swapped (out_format));
1910
1911   if (src_yuv && dst_yuv) {
1912     convert->convert = convert_YUV_TO_YUV;
1913
1914     if (src_planar && dst_planar) {
1915       templ.read_chroma = READ_CHROMA_FROM_PLANAR;
1916       templ.write_chroma = WRITE_CHROMA_TO_PLANAR;
1917     } else if (!src_planar && dst_planar) {
1918       templ.read_chroma = READ_CHROMA_FROM_SEMI_PLANAR;
1919       templ.write_chroma = WRITE_CHROMA_TO_PLANAR;
1920     } else if (src_planar && !dst_planar) {
1921       templ.read_chroma = READ_CHROMA_FROM_PLANAR;
1922       templ.write_chroma = WRITE_CHROMA_TO_SEMI_PLANAR;
1923     } else {
1924       templ.read_chroma = READ_CHROMA_FROM_SEMI_PLANAR;
1925       templ.write_chroma = WRITE_CHROMA_TO_SEMI_PLANAR;
1926     }
1927
1928     convert->kernel_source =
1929         cuda_converter_generate_yuv_to_yuv_kernel_code (convert, &templ);
1930     convert->func_names[0] = GST_CUDA_KERNEL_FUNC;
1931
1932     ret = TRUE;
1933   } else if (src_yuv && !dst_yuv) {
1934     MatrixData matrix;
1935
1936     if (src_planar) {
1937       templ.read_chroma = READ_CHROMA_FROM_PLANAR;
1938     } else {
1939       templ.read_chroma = READ_CHROMA_FROM_SEMI_PLANAR;
1940     }
1941
1942     templ.max_in_val = (1 << templ.in_depth) - 1;
1943     cuda_converter_get_rgb_order (out_format, &templ.rgb_order);
1944
1945     cuda_converter_get_matrix (convert, &matrix, in_info, out_info);
1946     convert->kernel_source =
1947         cuda_converter_generate_yuv_to_rgb_kernel_code (convert,
1948         &templ, &matrix);
1949     convert->func_names[0] = GST_CUDA_KERNEL_FUNC;
1950
1951     convert->convert = convert_YUV_TO_RGB;
1952
1953     ret = TRUE;
1954   } else if (!src_yuv && dst_yuv) {
1955     MatrixData matrix;
1956     gsize element_size = 8;
1957     GstVideoFormat unpack_format;
1958     GstVideoFormat y444_format;
1959     GstVideoInfo unpack_info;
1960     GstVideoInfo y444_info;
1961     gint i;
1962
1963     if (dst_planar) {
1964       templ.write_chroma = WRITE_CHROMA_TO_PLANAR;
1965     } else {
1966       templ.write_chroma = WRITE_CHROMA_TO_SEMI_PLANAR;
1967     }
1968     templ.read_chroma = READ_CHROMA_FROM_PLANAR;
1969
1970     cuda_converter_get_rgb_order (in_format, &convert->in_rgb_order);
1971
1972     if (templ.in_depth > 8) {
1973       /* FIXME: RGB10A2_LE and BGR10A2_LE only */
1974       element_size = 16;
1975       unpack_format = GST_VIDEO_FORMAT_ARGB64;
1976       y444_format = GST_VIDEO_FORMAT_Y444_16LE;
1977       templ.unpack_function = unpack_to_ARGB64;
1978     } else {
1979       unpack_format = GST_VIDEO_FORMAT_ARGB;
1980       y444_format = GST_VIDEO_FORMAT_Y444;
1981       templ.unpack_function = unpack_to_ARGB;
1982     }
1983
1984     gst_video_info_set_format (&unpack_info,
1985         unpack_format, GST_VIDEO_INFO_WIDTH (in_info),
1986         GST_VIDEO_INFO_HEIGHT (in_info));
1987     gst_video_info_set_format (&y444_info,
1988         y444_format, GST_VIDEO_INFO_WIDTH (in_info),
1989         GST_VIDEO_INFO_HEIGHT (in_info));
1990
1991     templ.in_depth = GST_VIDEO_INFO_COMP_DEPTH (&unpack_info, 0);
1992
1993     cuda_ret = CuMemAllocPitch (&convert->unpack_surface.device_ptr,
1994         &convert->unpack_surface.cuda_stride,
1995         GST_VIDEO_INFO_COMP_WIDTH (&unpack_info, 0) *
1996         GST_VIDEO_INFO_COMP_PSTRIDE (&unpack_info, 0),
1997         GST_VIDEO_INFO_HEIGHT (&unpack_info), element_size);
1998
1999     if (!gst_cuda_result (cuda_ret)) {
2000       GST_ERROR ("couldn't alloc unpack surface");
2001       return FALSE;
2002     }
2003
2004     for (i = 0; i < 3; i++) {
2005       cuda_ret = CuMemAllocPitch (&convert->y444_surface[i].device_ptr,
2006           &convert->y444_surface[i].cuda_stride,
2007           GST_VIDEO_INFO_COMP_WIDTH (&y444_info, i) *
2008           GST_VIDEO_INFO_COMP_PSTRIDE (&y444_info, i),
2009           GST_VIDEO_INFO_COMP_HEIGHT (&y444_info, i), element_size);
2010
2011       if (!gst_cuda_result (cuda_ret)) {
2012         GST_ERROR ("couldn't alloc %dth y444 surface", i);
2013         return FALSE;
2014       }
2015     }
2016
2017     cuda_converter_get_matrix (convert, &matrix, &unpack_info, &y444_info);
2018
2019     convert->kernel_source =
2020         cuda_converter_generate_rgb_to_yuv_kernel_code (convert,
2021         &templ, &matrix);
2022
2023     convert->func_names[0] = GST_CUDA_KERNEL_FUNC_TO_ARGB;
2024     convert->func_names[1] = GST_CUDA_KERNEL_FUNC_TO_Y444;
2025     convert->func_names[2] = GST_CUDA_KERNEL_FUNC_Y444_TO_YUV;
2026
2027     convert->convert = convert_RGB_TO_YUV;
2028
2029     ret = TRUE;
2030   } else {
2031     gsize element_size = 8;
2032     GstVideoFormat unpack_format;
2033     GstVideoInfo unpack_info;
2034
2035     cuda_converter_get_rgb_order (in_format, &convert->in_rgb_order);
2036     cuda_converter_get_rgb_order (out_format, &templ.rgb_order);
2037
2038     if (templ.in_depth > 8) {
2039       /* FIXME: RGB10A2_LE and BGR10A2_LE only */
2040       element_size = 16;
2041       unpack_format = GST_VIDEO_FORMAT_ARGB64;
2042       templ.unpack_function = unpack_to_ARGB64;
2043     } else {
2044       unpack_format = GST_VIDEO_FORMAT_ARGB;
2045       templ.unpack_function = unpack_to_ARGB;
2046     }
2047
2048     gst_video_info_set_format (&unpack_info,
2049         unpack_format, GST_VIDEO_INFO_WIDTH (in_info),
2050         GST_VIDEO_INFO_HEIGHT (in_info));
2051
2052     templ.in_depth = GST_VIDEO_INFO_COMP_DEPTH (&unpack_info, 0);
2053
2054     cuda_ret = CuMemAllocPitch (&convert->unpack_surface.device_ptr,
2055         &convert->unpack_surface.cuda_stride,
2056         GST_VIDEO_INFO_COMP_WIDTH (&unpack_info, 0) *
2057         GST_VIDEO_INFO_COMP_PSTRIDE (&unpack_info, 0),
2058         GST_VIDEO_INFO_HEIGHT (&unpack_info), element_size);
2059
2060     if (!gst_cuda_result (cuda_ret)) {
2061       GST_ERROR ("couldn't alloc unpack surface");
2062       return FALSE;
2063     }
2064
2065     convert->kernel_source =
2066         cuda_converter_generate_rgb_to_rgb_kernel_code (convert, &templ);
2067
2068     convert->func_names[0] = GST_CUDA_KERNEL_FUNC_TO_ARGB;
2069     convert->func_names[1] = GST_CUDA_KERNEL_FUNC_SCALE_RGB;
2070
2071     convert->convert = convert_RGB_TO_RGB;
2072
2073     ret = TRUE;
2074   }
2075
2076   if (!ret) {
2077     GST_DEBUG ("no path found");
2078
2079     return FALSE;
2080   }
2081
2082   GST_TRACE ("configured CUDA kernel source\n%s", convert->kernel_source);
2083
2084   return TRUE;
2085 }