2 * Copyright (C) 2010 David Schleef <ds@schleef.org>
3 * Copyright (C) 2010 Sebastian Dröge <sebastian.droege@collabora.co.uk>
4 * Copyright (C) 2019 Seungha Yang <seungha.yang@navercorp.com>
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Library General Public License for more details.
16 * You should have received a copy of the GNU Library General Public
17 * License along with this library; if not, write to the
18 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
19 * Boston, MA 02110-1301, USA.
23 * SECTION:cudaconverter
24 * @title: GstCudaConverter
25 * @short_description: Generic video conversion using CUDA
27 * This object is used to convert video frames from one format to another.
28 * The object can perform conversion of:
37 * * Add more interpolation method and make it selectable,
38 * currently default bi-linear interpolation only
39 * * Add fast-path for conversion like videoconvert
40 * * Full colorimetry and chroma-siting support
41 * * cropping, and x, y position support
48 #include "cuda-converter.h"
49 #include "gstcudautils.h"
50 #include "gstcudaloader.h"
51 #include "gstcudanvrtc.h"
54 #define CUDA_BLOCK_X 16
55 #define CUDA_BLOCK_Y 16
56 #define DIV_UP(size,block) (((size) + ((block) - 1)) / (block))
58 static gboolean cuda_converter_lookup_path (GstCudaConverter * convert);
60 #ifndef GST_DISABLE_GST_DEBUG
61 #define GST_CAT_DEFAULT ensure_debug_category()
62 static GstDebugCategory *
63 ensure_debug_category (void)
65 static gsize cat_gonce = 0;
67 if (g_once_init_enter (&cat_gonce)) {
70 cat_done = (gsize) _gst_debug_category_new ("cuda-converter", 0,
71 "cuda-converter object");
73 g_once_init_leave (&cat_gonce, cat_done);
76 return (GstDebugCategory *) cat_gonce;
79 #define ensure_debug_category()
82 #define GST_CUDA_KERNEL_FUNC "gst_cuda_kernel_func"
84 #define GST_CUDA_KERNEL_FUNC_TO_Y444 "gst_cuda_kernel_func_to_y444"
86 #define GST_CUDA_KERNEL_FUNC_Y444_TO_YUV "gst_cuda_kernel_func_y444_to_yuv"
88 #define GST_CUDA_KERNEL_FUNC_TO_ARGB "gst_cuda_kernel_func_to_argb"
90 #define GST_CUDA_KERNEL_FUNC_SCALE_RGB "gst_cuda_kernel_func_scale_rgb"
95 * @tex1: a CUDA texture object representing a semi-planar chroma plane
97 * @x: the x coordinate to read data from @tex1
98 * @y: the y coordinate to read data from @tex1
100 * Returns: a #ushort2 vector representing both chroma pixel values
102 static const gchar READ_CHROMA_FROM_SEMI_PLANAR[] =
103 "__device__ ushort2\n"
104 "read_chroma (cudaTextureObject_t tex1, cudaTextureObject_t tex2, \n"
105 " float x, float y)\n"
107 " return tex2D<ushort2>(tex1, x, y);\n"
112 * @tex1: a CUDA texture object representing a chroma planar plane
113 * @tex2: a CUDA texture object representing the other planar plane
114 * @x: the x coordinate to read data from @tex1 and @tex2
115 * @y: the y coordinate to read data from @tex1 and @tex2
117 * Returns: a #ushort2 vector representing both chroma pixel values
119 static const gchar READ_CHROMA_FROM_PLANAR[] =
120 "__device__ ushort2\n"
121 "read_chroma (cudaTextureObject_t tex1, cudaTextureObject_t tex2, \n"
122 " float x, float y)\n"
124 " unsigned short u, v;\n"
125 " u = tex2D<unsigned short>(tex1, x, y);\n"
126 " v = tex2D<unsigned short>(tex2, x, y);\n"
127 " return make_ushort2(u, v);\n"
132 * @dst1: a CUDA global memory pointing to a semi-planar chroma plane
134 * @u: a pixel value to write @dst1
135 * @v: a pixel value to write @dst1
136 * @x: the x coordinate to write data into @tex1
137 * @x: the y coordinate to write data into @tex1
138 * @pstride: the pixel stride of @dst1
139 * @mask: bitmask to be applied to high bitdepth plane
141 * Write @u and @v pixel value to @dst1 semi-planar plane
143 static const gchar WRITE_CHROMA_TO_SEMI_PLANAR[] =
145 "write_chroma (unsigned char *dst1, unsigned char *dst2, unsigned short u,\n"
146 " unsigned short v, int x, int y, int pstride, int stride, int mask)\n"
148 " if (OUT_DEPTH > 8) {\n"
149 " *(unsigned short *)&dst1[x * pstride + y * stride] = (u & mask);\n"
150 " *(unsigned short *)&dst1[x * pstride + 2 + y * stride] = (v & mask);\n"
152 " dst1[x * pstride + y * stride] = u;\n"
153 " dst1[x * pstride + 1 + y * stride] = v;\n"
159 * @dst1: a CUDA global memory pointing to a planar chroma plane
160 * @dst2: a CUDA global memory pointing to a the other planar chroma plane
161 * @u: a pixel value to write @dst1
162 * @v: a pixel value to write @dst1
163 * @x: the x coordinate to write data into @tex1
164 * @x: the y coordinate to write data into @tex1
165 * @pstride: the pixel stride of @dst1
166 * @mask: bitmask to be applied to high bitdepth plane
168 * Write @u and @v pixel value into @dst1 and @dst2 planar planes
170 static const gchar WRITE_CHROMA_TO_PLANAR[] =
172 "write_chroma (unsigned char *dst1, unsigned char *dst2, unsigned short u,\n"
173 " unsigned short v, int x, int y, int pstride, int stride, int mask)\n"
175 " if (OUT_DEPTH > 8) {\n"
176 " *(unsigned short *)&dst1[x * pstride + y * stride] = (u & mask);\n"
177 " *(unsigned short *)&dst2[x * pstride + y * stride] = (v & mask);\n"
179 " dst1[x * pstride + y * stride] = u;\n"
180 " dst2[x * pstride + y * stride] = v;\n"
184 /* CUDA kernel source for from YUV to YUV conversion and scale */
185 static const gchar templ_YUV_TO_YUV[] =
187 "__constant__ float SCALE_H = %f;\n"
188 "__constant__ float SCALE_V = %f;\n"
189 "__constant__ float CHROMA_SCALE_H = %f;\n"
190 "__constant__ float CHROMA_SCALE_V = %f;\n"
191 "__constant__ int WIDTH = %d;\n"
192 "__constant__ int HEIGHT = %d;\n"
193 "__constant__ int CHROMA_WIDTH = %d;\n"
194 "__constant__ int CHROMA_HEIGHT = %d;\n"
195 "__constant__ int IN_DEPTH = %d;\n"
196 "__constant__ int OUT_DEPTH = %d;\n"
197 "__constant__ int PSTRIDE = %d;\n"
198 "__constant__ int CHROMA_PSTRIDE = %d;\n"
199 "__constant__ int IN_SHIFT = %d;\n"
200 "__constant__ int OUT_SHIFT = %d;\n"
201 "__constant__ int MASK = %d;\n"
202 "__constant__ int SWAP_UV = %d;\n"
204 "__device__ unsigned short\n"
205 "do_scale_pixel (unsigned short val) \n"
207 " unsigned int diff;\n"
208 " if (OUT_DEPTH > IN_DEPTH) {\n"
209 " diff = OUT_DEPTH - IN_DEPTH;\n"
210 " return (val << diff) | (val >> (IN_DEPTH - diff));\n"
211 " } else if (IN_DEPTH > OUT_DEPTH) {\n"
212 " return val >> (IN_DEPTH - OUT_DEPTH);\n"
217 /* __device__ ushort2
218 * read_chroma (cudaTextureObject_t tex1, cudaTextureObject_t tex2, float x, float y);
223 * write_chroma (unsigned char *dst1, unsigned char *dst2, unsigned short u,
224 * unsigned short v, int x, int y, int pstride, int stride, int mask);
230 "(cudaTextureObject_t tex0, cudaTextureObject_t tex1, cudaTextureObject_t tex2,\n"
231 " unsigned char *dst0, unsigned char *dst1, unsigned char *dst2,\n"
234 " int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
235 " int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
236 " if (x_pos < WIDTH && y_pos < HEIGHT) {\n"
237 " float src_xpos = SCALE_H * x_pos;\n"
238 " float src_ypos = SCALE_V * y_pos;\n"
239 " unsigned short y = tex2D<unsigned short>(tex0, src_xpos, src_ypos);\n"
240 " y = y >> IN_SHIFT;\n"
241 " y = do_scale_pixel (y);\n"
242 " y = y << OUT_SHIFT;\n"
243 " if (OUT_DEPTH > 8) {\n"
244 " *(unsigned short *)&dst0[x_pos * PSTRIDE + y_pos * stride] = (y & MASK);\n"
246 " dst0[x_pos * PSTRIDE + y_pos * stride] = y;\n"
249 " if (x_pos < CHROMA_WIDTH && y_pos < CHROMA_HEIGHT) {\n"
250 " float src_xpos = CHROMA_SCALE_H * x_pos;\n"
251 " float src_ypos = CHROMA_SCALE_V * y_pos;\n"
252 " unsigned short u, v;\n"
253 " ushort2 uv = read_chroma (tex1, tex2, src_xpos, src_ypos);\n"
256 " u = u >> IN_SHIFT;\n"
257 " v = v >> IN_SHIFT;\n"
258 " u = do_scale_pixel (u);\n"
259 " v = do_scale_pixel (v);\n"
260 " u = u << OUT_SHIFT;\n"
261 " v = v << OUT_SHIFT;\n"
263 " unsigned short tmp = u;\n"
267 " write_chroma (dst1,\n"
268 " dst2, u, v, x_pos, y_pos, CHROMA_PSTRIDE, stride, MASK);\n"
274 /* CUDA kernel source for from YUV to RGB conversion and scale */
275 static const gchar templ_YUV_TO_RGB[] =
277 "__constant__ float offset[3] = {%f, %f, %f};\n"
278 "__constant__ float rcoeff[3] = {%f, %f, %f};\n"
279 "__constant__ float gcoeff[3] = {%f, %f, %f};\n"
280 "__constant__ float bcoeff[3] = {%f, %f, %f};\n"
282 "__constant__ float SCALE_H = %f;\n"
283 "__constant__ float SCALE_V = %f;\n"
284 "__constant__ float CHROMA_SCALE_H = %f;\n"
285 "__constant__ float CHROMA_SCALE_V = %f;\n"
286 "__constant__ int WIDTH = %d;\n"
287 "__constant__ int HEIGHT = %d;\n"
288 "__constant__ int CHROMA_WIDTH = %d;\n"
289 "__constant__ int CHROMA_HEIGHT = %d;\n"
290 "__constant__ int IN_DEPTH = %d;\n"
291 "__constant__ int OUT_DEPTH = %d;\n"
292 "__constant__ int PSTRIDE = %d;\n"
293 "__constant__ int CHROMA_PSTRIDE = %d;\n"
294 "__constant__ int IN_SHIFT = %d;\n"
295 "__constant__ int OUT_SHIFT = %d;\n"
296 "__constant__ int MASK = %d;\n"
297 "__constant__ int SWAP_UV = %d;\n"
298 "__constant__ int MAX_IN_VAL = %d;\n"
299 "__constant__ int R_IDX = %d;\n"
300 "__constant__ int G_IDX = %d;\n"
301 "__constant__ int B_IDX = %d;\n"
302 "__constant__ int A_IDX = %d;\n"
303 "__constant__ int X_IDX = %d;\n"
305 "__device__ unsigned short\n"
306 "do_scale_pixel (unsigned short val) \n"
308 " unsigned int diff;\n"
309 " if (OUT_DEPTH > IN_DEPTH) {\n"
310 " diff = OUT_DEPTH - IN_DEPTH;\n"
311 " return (val << diff) | (val >> (IN_DEPTH - diff));\n"
312 " } else if (IN_DEPTH > OUT_DEPTH) {\n"
313 " return val >> (IN_DEPTH - OUT_DEPTH);\n"
319 "dot(float3 val, float *coeff)\n"
321 " return val.x * coeff[0] + val.y * coeff[1] + val.z * coeff[2];\n"
325 "yuv_to_rgb (unsigned short y, unsigned short u, unsigned short v, unsigned int max_val)\n"
327 " float3 yuv = make_float3 (y, u, v);\n"
329 " rgb.x = max ((unsigned int)(dot (yuv, rcoeff) + offset[0]), 0);\n"
330 " rgb.y = max ((unsigned int)(dot (yuv, gcoeff) + offset[1]), 0);\n"
331 " rgb.z = max ((unsigned int)(dot (yuv, bcoeff) + offset[2]), 0);\n"
332 " rgb.x = min (rgb.x, max_val);\n"
333 " rgb.y = min (rgb.y, max_val);\n"
334 " rgb.z = min (rgb.z, max_val);\n"
338 /* __device__ ushort2
339 * read_chroma (cudaTextureObject_t tex1, cudaTextureObject_t tex2, float x, float y);
345 "(cudaTextureObject_t tex0, cudaTextureObject_t tex1, cudaTextureObject_t tex2,\n"
346 " unsigned char *dstRGB, int stride)\n"
348 " int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
349 " int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
350 " if (x_pos < WIDTH && y_pos < HEIGHT) {\n"
351 " float src_xpos = SCALE_H * x_pos;\n"
352 " float src_ypos = SCALE_V * y_pos;\n"
353 " unsigned short y = tex2D<unsigned short>(tex0, src_xpos, src_ypos);\n"
355 " unsigned short u, v;\n"
357 " unsigned int clip_max = MAX_IN_VAL;\n"
358 " src_xpos = CHROMA_SCALE_H * x_pos;\n"
359 " src_ypos = CHROMA_SCALE_V * y_pos;\n"
360 " uv = read_chroma (tex1, tex2, src_xpos, src_ypos);\n"
363 " y = y >> IN_SHIFT;\n"
364 " u = u >> IN_SHIFT;\n"
365 " v = v >> IN_SHIFT;\n"
367 " unsigned short tmp = u;\n"
371 /* conversion matrix is scaled to higher bitdepth between in/out formats */
372 " if (OUT_DEPTH > IN_DEPTH) {\n"
373 " y = do_scale_pixel (y);\n"
374 " u = do_scale_pixel (u);\n"
375 " v = do_scale_pixel (v);\n"
376 " clip_max = MASK;\n"
378 " rgb = yuv_to_rgb (y, u, v, clip_max);\n"
379 " if (OUT_DEPTH < IN_DEPTH) {\n"
380 " rgb.x = do_scale_pixel (rgb.x);\n"
381 " rgb.y = do_scale_pixel (rgb.y);\n"
382 " rgb.z = do_scale_pixel (rgb.z);\n"
384 " if (OUT_DEPTH > 8) {\n"
385 " unsigned int packed_rgb = 0;\n"
386 /* A is always MSB, we support only little endian system */
387 " packed_rgb = 0xc000 << 16;\n"
388 " packed_rgb |= (rgb.x << (30 - (R_IDX * 10)));\n"
389 " packed_rgb |= (rgb.y << (30 - (G_IDX * 10)));\n"
390 " packed_rgb |= (rgb.z << (30 - (B_IDX * 10)));\n"
391 " *(unsigned int *)&dstRGB[x_pos * PSTRIDE + y_pos * stride] = packed_rgb;\n"
393 " dstRGB[x_pos * PSTRIDE + R_IDX + y_pos * stride] = (unsigned char) rgb.x;\n"
394 " dstRGB[x_pos * PSTRIDE + G_IDX + y_pos * stride] = (unsigned char) rgb.y;\n"
395 " dstRGB[x_pos * PSTRIDE + B_IDX + y_pos * stride] = (unsigned char) rgb.z;\n"
396 " if (A_IDX >= 0 || X_IDX >= 0)\n"
397 " dstRGB[x_pos * PSTRIDE + A_IDX + y_pos * stride] = 0xff;\n"
405 * GST_CUDA_KERNEL_FUNC_TO_ARGB:
406 * @srcRGB: a CUDA global memory containing a RGB image
407 * @dstRGB: a CUDA global memory to store unpacked ARGB image
408 * @width: the width of @srcRGB and @dstRGB
409 * @height: the height of @srcRGB and @dstRGB
410 * @src_stride: the stride of @srcRGB
411 * @src_pstride: the pixel stride of @srcRGB
412 * @dst_stride: the stride of @dstRGB
413 * @r_idx: the index of red component of @srcRGB
414 * @g_idx: the index of green component of @srcRGB
415 * @b_idx: the index of blue component of @srcRGB
416 * @a_idx: the index of alpha component of @srcRGB
418 * Unpack a RGB image from @srcRGB and write the unpacked data into @dstRGB
420 static const gchar unpack_to_ARGB[] =
422 GST_CUDA_KERNEL_FUNC_TO_ARGB
423 "(unsigned char *srcRGB, unsigned char *dstRGB, int width, int height,\n"
424 " int src_stride, int src_pstride, int dst_stride,\n"
425 " int r_idx, int g_idx, int b_idx, int a_idx)\n"
427 " int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
428 " int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
429 " if (x_pos < width && y_pos < height) {\n"
430 " if (a_idx >= 0) {\n"
431 " dstRGB[x_pos * 4 + y_pos * dst_stride] =\n"
432 " srcRGB[x_pos * src_pstride + a_idx + y_pos * src_stride];\n"
434 " dstRGB[x_pos * 4 + y_pos * dst_stride] = 0xff;\n"
436 " dstRGB[x_pos * 4 + 1 + y_pos * dst_stride] =\n"
437 " srcRGB[x_pos * src_pstride + r_idx + y_pos * src_stride];\n"
438 " dstRGB[x_pos * 4 + 2 + y_pos * dst_stride] =\n"
439 " srcRGB[x_pos * src_pstride + g_idx + y_pos * src_stride];\n"
440 " dstRGB[x_pos * 4 + 3 + y_pos * dst_stride] =\n"
441 " srcRGB[x_pos * src_pstride + b_idx + y_pos * src_stride];\n"
446 * GST_CUDA_KERNEL_FUNC_TO_ARGB:
447 * @srcRGB: a CUDA global memory containing a RGB image
448 * @dstRGB: a CUDA global memory to store unpacked ARGB64 image
449 * @width: the width of @srcRGB and @dstRGB
450 * @height: the height of @srcRGB and @dstRGB
451 * @src_stride: the stride of @srcRGB
452 * @src_pstride: the pixel stride of @srcRGB
453 * @dst_stride: the stride of @dstRGB
454 * @r_idx: the index of red component of @srcRGB
455 * @g_idx: the index of green component of @srcRGB
456 * @b_idx: the index of blue component of @srcRGB
457 * @a_idx: the index of alpha component of @srcRGB
459 * Unpack a RGB image from @srcRGB and write the unpacked data into @dstRGB
461 static const gchar unpack_to_ARGB64[] =
463 GST_CUDA_KERNEL_FUNC_TO_ARGB
464 "(unsigned char *srcRGB, unsigned char *dstRGB, int width, int height,\n"
465 " int src_stride, int src_pstride, int dst_stride,\n"
466 " int r_idx, int g_idx, int b_idx, int a_idx)\n"
468 " int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
469 " int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
470 " if (x_pos < width && y_pos < height) {\n"
471 " unsigned short a, r, g, b;\n"
472 " unsigned int read_val;\n"
473 " read_val = *(unsigned int *)&srcRGB[x_pos * src_pstride + y_pos * src_stride];\n"
474 " a = (read_val >> 30) & 0x03;\n"
475 " a = (a << 14) | (a << 12) | (a << 10) | (a << 8) | (a << 6) | (a << 4) | (a << 2) | (a << 0);\n"
476 " r = ((read_val >> (30 - (r_idx * 10))) & 0x3ff);\n"
477 " r = (r << 6) | (r >> 4);\n"
478 " g = ((read_val >> (30 - (g_idx * 10))) & 0x3ff);\n"
479 " g = (g << 6) | (g >> 4);\n"
480 " b = ((read_val >> (30 - (b_idx * 10))) & 0x3ff);\n"
481 " b = (b << 6) | (b >> 4);\n"
482 " *(unsigned short *)&dstRGB[x_pos * 8 + y_pos * dst_stride] = 0xffff;\n"
483 " *(unsigned short *)&dstRGB[x_pos * 8 + 2 + y_pos * dst_stride] = r;\n"
484 " *(unsigned short *)&dstRGB[x_pos * 8 + 4 + y_pos * dst_stride] = g;\n"
485 " *(unsigned short *)&dstRGB[x_pos * 8 + 6 + y_pos * dst_stride] = b;\n"
489 /* CUDA kernel source for from RGB to YUV conversion and scale */
490 static const gchar templ_RGB_TO_YUV[] =
492 "__constant__ float offset[3] = {%f, %f, %f};\n"
493 "__constant__ float ycoeff[3] = {%f, %f, %f};\n"
494 "__constant__ float ucoeff[3] = {%f, %f, %f};\n"
495 "__constant__ float vcoeff[3] = {%f, %f, %f};\n"
497 "__constant__ float SCALE_H = %f;\n"
498 "__constant__ float SCALE_V = %f;\n"
499 "__constant__ float CHROMA_SCALE_H = %f;\n"
500 "__constant__ float CHROMA_SCALE_V = %f;\n"
501 "__constant__ int WIDTH = %d;\n"
502 "__constant__ int HEIGHT = %d;\n"
503 "__constant__ int CHROMA_WIDTH = %d;\n"
504 "__constant__ int CHROMA_HEIGHT = %d;\n"
505 "__constant__ int IN_DEPTH = %d;\n"
506 "__constant__ int OUT_DEPTH = %d;\n"
507 "__constant__ int PSTRIDE = %d;\n"
508 "__constant__ int CHROMA_PSTRIDE = %d;\n"
509 "__constant__ int IN_SHIFT = %d;\n"
510 "__constant__ int OUT_SHIFT = %d;\n"
511 "__constant__ int MASK = %d;\n"
512 "__constant__ int SWAP_UV = %d;\n"
514 "__device__ unsigned short\n"
515 "do_scale_pixel (unsigned short val) \n"
517 " unsigned int diff;\n"
518 " if (OUT_DEPTH > IN_DEPTH) {\n"
519 " diff = OUT_DEPTH - IN_DEPTH;\n"
520 " return (val << diff) | (val >> (IN_DEPTH - diff));\n"
521 " } else if (IN_DEPTH > OUT_DEPTH) {\n"
522 " return val >> (IN_DEPTH - OUT_DEPTH);\n"
528 "dot(float3 val, float *coeff)\n"
530 " return val.x * coeff[0] + val.y * coeff[1] + val.z * coeff[2];\n"
534 "rgb_to_yuv (unsigned short r, unsigned short g, unsigned short b,\n"
535 " unsigned int max_val)\n"
537 " float3 rgb = make_float3 (r, g, b);\n"
539 " yuv.x = max ((unsigned int)(dot (rgb, ycoeff) + offset[0]), 0);\n"
540 " yuv.y = max ((unsigned int)(dot (rgb, ucoeff) + offset[1]), 0);\n"
541 " yuv.z = max ((unsigned int)(dot (rgb, vcoeff) + offset[2]), 0);\n"
542 " yuv.x = min (yuv.x, max_val);\n"
543 " yuv.y = min (yuv.y, max_val);\n"
544 " yuv.z = min (yuv.z, max_val);\n"
549 * GST_CUDA_KERNEL_FUNC_TO_ARGB
553 /* __device__ ushort2
554 * read_chroma (cudaTextureObject_t tex1, cudaTextureObject_t tex2, float x, float y);
559 * write_chroma (unsigned char *dst1, unsigned char *dst2, unsigned short u,
560 * unsigned short v, int x, int y, int pstride, int stride, int mask);
565 GST_CUDA_KERNEL_FUNC_TO_Y444
566 "(cudaTextureObject_t srcRGB, unsigned char *dstY, int y_stride,\n"
567 " unsigned char *dstU, int u_stride, unsigned char *dstV, int v_stride,\n"
568 " int width, int height, int dst_pstride, int in_depth)\n"
570 " int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
571 " int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
572 " if (x_pos < width && y_pos < height) {\n"
573 " ushort4 argb = tex2D<ushort4>(srcRGB, x_pos, y_pos);\n"
575 " yuv = rgb_to_yuv (argb.y, argb.z, argb.w, (1 << in_depth) - 1);\n"
576 " if (in_depth > 8) {\n"
577 " *(unsigned short *)&dstY[x_pos * dst_pstride + y_pos * y_stride] = yuv.x;\n"
578 " *(unsigned short *)&dstU[x_pos * dst_pstride + y_pos * u_stride] = yuv.y;\n"
579 " *(unsigned short *)&dstV[x_pos * dst_pstride + y_pos * v_stride] = yuv.z;\n"
581 " dstY[x_pos * dst_pstride + y_pos * y_stride] = yuv.x;\n"
582 " dstU[x_pos * dst_pstride + y_pos * u_stride] = yuv.y;\n"
583 " dstV[x_pos * dst_pstride + y_pos * v_stride] = yuv.z;\n"
589 GST_CUDA_KERNEL_FUNC_Y444_TO_YUV
590 "(cudaTextureObject_t tex0, cudaTextureObject_t tex1, cudaTextureObject_t tex2,\n"
591 " unsigned char *dst0, unsigned char *dst1, unsigned char *dst2,\n"
594 " int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
595 " int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
596 " if (x_pos < WIDTH && y_pos < HEIGHT) {\n"
597 " float src_xpos = SCALE_H * x_pos;\n"
598 " float src_ypos = SCALE_V * y_pos;\n"
599 " unsigned short y = tex2D<unsigned short>(tex0, src_xpos, src_ypos);\n"
600 " y = y >> IN_SHIFT;\n"
601 " y = do_scale_pixel (y);\n"
602 " y = y << OUT_SHIFT;\n"
603 " if (OUT_DEPTH > 8) {\n"
604 " *(unsigned short *)&dst0[x_pos * PSTRIDE + y_pos * stride] = (y & MASK);\n"
606 " dst0[x_pos * PSTRIDE + y_pos * stride] = y;\n"
609 " if (x_pos < CHROMA_WIDTH && y_pos < CHROMA_HEIGHT) {\n"
610 " float src_xpos = CHROMA_SCALE_H * x_pos;\n"
611 " float src_ypos = CHROMA_SCALE_V * y_pos;\n"
612 " unsigned short u, v;\n"
614 " uv = read_chroma (tex1, tex2, src_xpos, src_ypos);\n"
617 " u = u >> IN_SHIFT;\n"
618 " v = v >> IN_SHIFT;\n"
619 " u = do_scale_pixel (u);\n"
620 " v = do_scale_pixel (v);\n"
621 " u = u << OUT_SHIFT;\n"
622 " v = v << OUT_SHIFT;\n"
624 " unsigned short tmp = u;\n"
628 " write_chroma (dst1,\n"
629 " dst2, u, v, x_pos, y_pos, CHROMA_PSTRIDE, stride, MASK);\n"
635 /* CUDA kernel source for from RGB to RGB conversion and scale */
636 static const gchar templ_RGB_to_RGB[] =
638 "__constant__ float SCALE_H = %f;\n"
639 "__constant__ float SCALE_V = %f;\n"
640 "__constant__ int WIDTH = %d;\n"
641 "__constant__ int HEIGHT = %d;\n"
642 "__constant__ int IN_DEPTH = %d;\n"
643 "__constant__ int OUT_DEPTH = %d;\n"
644 "__constant__ int PSTRIDE = %d;\n"
645 "__constant__ int R_IDX = %d;\n"
646 "__constant__ int G_IDX = %d;\n"
647 "__constant__ int B_IDX = %d;\n"
648 "__constant__ int A_IDX = %d;\n"
649 "__constant__ int X_IDX = %d;\n"
651 "__device__ unsigned short\n"
652 "do_scale_pixel (unsigned short val) \n"
654 " unsigned int diff;\n"
655 " if (OUT_DEPTH > IN_DEPTH) {\n"
656 " diff = OUT_DEPTH - IN_DEPTH;\n"
657 " return (val << diff) | (val >> (IN_DEPTH - diff));\n"
658 " } else if (IN_DEPTH > OUT_DEPTH) {\n"
659 " return val >> (IN_DEPTH - OUT_DEPTH);\n"
665 * GST_CUDA_KERNEL_FUNC_TO_ARGB
669 /* convert ARGB or ARGB64 to other RGB formats with scale */
671 GST_CUDA_KERNEL_FUNC_SCALE_RGB
672 "(cudaTextureObject_t srcRGB, unsigned char *dstRGB, int dst_stride)\n"
674 " int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
675 " int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
676 " if (x_pos < WIDTH && y_pos < HEIGHT) {\n"
677 " float src_xpos = SCALE_H * x_pos;\n"
678 " float src_ypos = SCALE_V * y_pos;\n"
679 " ushort4 argb = tex2D<ushort4>(srcRGB, src_xpos, src_ypos);\n"
680 " argb.x = do_scale_pixel(argb.x);\n"
681 " argb.y = do_scale_pixel(argb.y);\n"
682 " argb.z = do_scale_pixel(argb.z);\n"
683 " argb.w = do_scale_pixel(argb.w);\n"
684 /* FIXME: RGB10A2_LE or BGR10A2_LE only */
685 " if (OUT_DEPTH > 8) {\n"
686 " unsigned int packed_rgb = 0;\n"
687 " unsigned int a, r, g, b;"
688 " a = (argb.x >> 8) & 0x3;\n"
689 " r = argb.y & 0x3ff;\n"
690 " g = argb.z & 0x3ff;\n"
691 " b = argb.w & 0x3ff;\n"
692 /* A is always MSB, we support only little endian system */
693 " packed_rgb = a << 30;\n"
694 " packed_rgb |= (r << (30 - (R_IDX * 10)));\n"
695 " packed_rgb |= (g << (30 - (G_IDX * 10)));\n"
696 " packed_rgb |= (b << (30 - (B_IDX * 10)));\n"
697 " *(unsigned int *)&dstRGB[x_pos * 4 + y_pos * dst_stride] = packed_rgb;\n"
699 " if (A_IDX >= 0) {\n"
700 " argb.x = do_scale_pixel(argb.x);\n"
701 " dstRGB[x_pos * PSTRIDE + A_IDX + y_pos * dst_stride] = argb.x;\n"
702 " } else if (X_IDX >= 0) {\n"
703 " dstRGB[x_pos * PSTRIDE + X_IDX + y_pos * dst_stride] = 0xff;\n"
705 " dstRGB[x_pos * PSTRIDE + R_IDX + y_pos * dst_stride] = argb.y;\n"
706 " dstRGB[x_pos * PSTRIDE + G_IDX + y_pos * dst_stride] = argb.z;\n"
707 " dstRGB[x_pos * PSTRIDE + B_IDX + y_pos * dst_stride] = argb.w;\n"
726 CUdeviceptr device_ptr;
728 } GstCudaStageBuffer;
730 #define CONVERTER_MAX_NUM_FUNC 4
732 struct _GstCudaConverter
734 GstVideoInfo in_info;
735 GstVideoInfo out_info;
738 gint texture_alignment;
740 GstCudaContext *cuda_ctx;
741 CUmodule cuda_module;
742 CUfunction kernel_func[CONVERTER_MAX_NUM_FUNC];
743 const gchar *func_names[CONVERTER_MAX_NUM_FUNC];
744 gchar *kernel_source;
746 GstCudaStageBuffer fallback_buffer[GST_VIDEO_MAX_PLANES];
748 gboolean (*convert) (GstCudaConverter * convert, const GstCudaMemory * src,
749 GstVideoInfo * in_info, GstCudaMemory * dst, GstVideoInfo * out_info,
750 CUstream cuda_stream);
752 const CUdeviceptr src;
753 GstVideoInfo *cur_in_info;
756 GstVideoInfo *cur_out_info;
758 /* rgb to {rgb, yuv} only */
759 GstCudaRGBOrder in_rgb_order;
760 GstCudaStageBuffer unpack_surface;
761 GstCudaStageBuffer y444_surface[GST_VIDEO_MAX_PLANES];
764 #define LOAD_CUDA_FUNC(module,func,name) G_STMT_START { \
765 if (!gst_cuda_result (CuModuleGetFunction (&(func), (module), name))) { \
766 GST_ERROR ("failed to get %s function", (name)); \
772 * gst_cuda_converter_new:
773 * @in_info: a #GstVideoInfo
774 * @out_info: a #GstVideoInfo
775 * @cuda_ctx: (transfer none): a #GstCudaContext
777 * Create a new converter object to convert between @in_info and @out_info
780 * Returns: a #GstCudaConverter or %NULL if conversion is not possible.
783 gst_cuda_converter_new (GstVideoInfo * in_info, GstVideoInfo * out_info,
784 GstCudaContext * cuda_ctx)
786 GstCudaConverter *convert;
789 g_return_val_if_fail (in_info != NULL, NULL);
790 g_return_val_if_fail (out_info != NULL, NULL);
791 g_return_val_if_fail (cuda_ctx != NULL, NULL);
792 /* we won't ever do framerate conversion */
793 g_return_val_if_fail (in_info->fps_n == out_info->fps_n, NULL);
794 g_return_val_if_fail (in_info->fps_d == out_info->fps_d, NULL);
795 /* we won't ever do deinterlace */
796 g_return_val_if_fail (in_info->interlace_mode == out_info->interlace_mode,
799 convert = g_new0 (GstCudaConverter, 1);
801 convert->in_info = *in_info;
802 convert->out_info = *out_info;
804 /* FIXME: should return kernel source */
805 if (!gst_cuda_context_push (cuda_ctx)) {
806 GST_ERROR ("cannot push context");
810 if (!cuda_converter_lookup_path (convert))
813 convert->ptx = gst_cuda_nvrtc_compile (convert->kernel_source);
815 GST_ERROR ("no PTX data to load");
819 GST_TRACE ("compiled convert ptx \n%s", convert->ptx);
821 if (!gst_cuda_result (CuModuleLoadData (&convert->cuda_module, convert->ptx))) {
822 gst_cuda_context_pop (NULL);
823 GST_ERROR ("failed to load cuda module data");
828 for (i = 0; i < CONVERTER_MAX_NUM_FUNC; i++) {
829 if (!convert->func_names[i])
832 LOAD_CUDA_FUNC (convert->cuda_module, convert->kernel_func[i],
833 convert->func_names[i]);
834 GST_DEBUG ("kernel function \"%s\" loaded", convert->func_names[i]);
837 gst_cuda_context_pop (NULL);
838 convert->cuda_ctx = gst_object_ref (cuda_ctx);
839 convert->texture_alignment =
840 gst_cuda_context_get_texture_alignment (cuda_ctx);
842 g_free (convert->kernel_source);
843 g_free (convert->ptx);
844 convert->kernel_source = NULL;
850 gst_cuda_context_pop (NULL);
851 gst_cuda_converter_free (convert);
857 * gst_video_converter_free:
858 * @convert: a #GstCudaConverter
863 gst_cuda_converter_free (GstCudaConverter * convert)
865 g_return_if_fail (convert != NULL);
867 if (convert->cuda_ctx) {
868 if (gst_cuda_context_push (convert->cuda_ctx)) {
871 if (convert->cuda_module) {
872 gst_cuda_result (CuModuleUnload (convert->cuda_module));
875 for (i = 0; i < GST_VIDEO_MAX_PLANES; i++) {
876 if (convert->fallback_buffer[i].device_ptr)
877 gst_cuda_result (CuMemFree (convert->fallback_buffer[i].device_ptr));
878 if (convert->y444_surface[i].device_ptr)
879 gst_cuda_result (CuMemFree (convert->y444_surface[i].device_ptr));
882 if (convert->unpack_surface.device_ptr)
883 gst_cuda_result (CuMemFree (convert->unpack_surface.device_ptr));
885 gst_cuda_context_pop (NULL);
888 gst_object_unref (convert->cuda_ctx);
891 g_free (convert->kernel_source);
892 g_free (convert->ptx);
897 * gst_cuda_converter_frame:
898 * @convert: a #GstCudaConverter
899 * @src: a #GstCudaMemory
900 * @in_info: a #GstVideoInfo representing @src
901 * @dst: a #GstCudaMemory
902 * @out_info: a #GstVideoInfo representing @dst
903 * @cuda_stream: a #CUstream
905 * Convert the pixels of @src into @dest using @convert.
906 * Called without gst_cuda_context_push() and gst_cuda_context_pop() by caller
909 gst_cuda_converter_frame (GstCudaConverter * convert, const GstCudaMemory * src,
910 GstVideoInfo * in_info, GstCudaMemory * dst, GstVideoInfo * out_info,
911 CUstream cuda_stream)
915 g_return_val_if_fail (convert, FALSE);
916 g_return_val_if_fail (src, FALSE);
917 g_return_val_if_fail (in_info, FALSE);
918 g_return_val_if_fail (dst, FALSE);
919 g_return_val_if_fail (out_info, FALSE);
921 gst_cuda_context_push (convert->cuda_ctx);
923 ret = gst_cuda_converter_frame_unlocked (convert,
924 src, in_info, dst, out_info, cuda_stream);
926 gst_cuda_context_pop (NULL);
932 * gst_cuda_converter_frame_unlocked:
933 * @convert: a #GstCudaConverter
934 * @src: a #GstCudaMemory
935 * @in_info: a #GstVideoInfo representing @src
936 * @dst: a #GstCudaMemory
937 * @out_info: a #GstVideoInfo representing @dest
938 * @cuda_stream: a #CUstream
940 * Convert the pixels of @src into @dest using @convert.
941 * Caller should call this method after gst_cuda_context_push()
944 gst_cuda_converter_frame_unlocked (GstCudaConverter * convert,
945 const GstCudaMemory * src, GstVideoInfo * in_info, GstCudaMemory * dst,
946 GstVideoInfo * out_info, CUstream cuda_stream)
948 g_return_val_if_fail (convert, FALSE);
949 g_return_val_if_fail (src, FALSE);
950 g_return_val_if_fail (in_info, FALSE);
951 g_return_val_if_fail (dst, FALSE);
952 g_return_val_if_fail (out_info, FALSE);
954 return convert->convert (convert, src, in_info, dst, out_info, cuda_stream);
957 /* allocate fallback memory for texture alignment requirement */
959 convert_ensure_fallback_memory (GstCudaConverter * convert,
960 GstVideoInfo * info, guint plane)
963 guint element_size = 8;
965 if (convert->fallback_buffer[plane].device_ptr)
968 if (GST_VIDEO_INFO_COMP_DEPTH (info, 0) > 8)
971 ret = CuMemAllocPitch (&convert->fallback_buffer[plane].device_ptr,
972 &convert->fallback_buffer[plane].cuda_stride,
973 GST_VIDEO_INFO_COMP_WIDTH (info, plane) *
974 GST_VIDEO_INFO_COMP_PSTRIDE (info, plane),
975 GST_VIDEO_INFO_COMP_HEIGHT (info, plane), element_size);
977 if (!gst_cuda_result (ret)) {
978 GST_ERROR ("failed to allocated fallback memory");
985 /* create a 2D CUDA texture without alignment check */
987 convert_create_texture_unchecked (const CUdeviceptr src, gint width,
988 gint height, gint channels, gint stride, CUarray_format format,
989 CUfilter_mode mode, CUstream cuda_stream)
991 CUDA_TEXTURE_DESC texture_desc;
992 CUDA_RESOURCE_DESC resource_desc;
993 CUtexObject texture = 0;
996 memset (&texture_desc, 0, sizeof (CUDA_TEXTURE_DESC));
997 memset (&resource_desc, 0, sizeof (CUDA_RESOURCE_DESC));
999 resource_desc.resType = CU_RESOURCE_TYPE_PITCH2D;
1000 resource_desc.res.pitch2D.format = format;
1001 resource_desc.res.pitch2D.numChannels = channels;
1002 resource_desc.res.pitch2D.width = width;
1003 resource_desc.res.pitch2D.height = height;
1004 resource_desc.res.pitch2D.pitchInBytes = stride;
1005 resource_desc.res.pitch2D.devPtr = src;
1007 texture_desc.filterMode = mode;
1008 texture_desc.flags = CU_TRSF_READ_AS_INTEGER;
1010 gst_cuda_result (CuStreamSynchronize (cuda_stream));
1011 cuda_ret = CuTexObjectCreate (&texture, &resource_desc, &texture_desc, NULL);
1013 if (!gst_cuda_result (cuda_ret)) {
1014 GST_ERROR ("couldn't create texture");
1023 convert_create_texture (GstCudaConverter * convert, const GstCudaMemory * src,
1024 GstVideoInfo * info, guint plane, CUstream cuda_stream)
1026 CUarray_format format = CU_AD_FORMAT_UNSIGNED_INT8;
1028 CUdeviceptr src_ptr;
1033 if (GST_VIDEO_INFO_COMP_DEPTH (info, plane) > 8)
1034 format = CU_AD_FORMAT_UNSIGNED_INT16;
1036 /* FIXME: more graceful method ? */
1038 GST_VIDEO_INFO_N_PLANES (info) != GST_VIDEO_INFO_N_COMPONENTS (info)) {
1042 src_ptr = src->data + src->offset[plane];
1043 stride = src->stride;
1045 if (convert->texture_alignment && (src_ptr % convert->texture_alignment)) {
1046 CUDA_MEMCPY2D copy_params = { 0, };
1048 if (!convert_ensure_fallback_memory (convert, info, plane))
1051 GST_LOG ("device memory was not aligned, copy to fallback memory");
1053 copy_params.srcMemoryType = CU_MEMORYTYPE_DEVICE;
1054 copy_params.srcPitch = stride;
1055 copy_params.srcDevice = (CUdeviceptr) src_ptr;
1057 copy_params.dstMemoryType = CU_MEMORYTYPE_DEVICE;
1058 copy_params.dstPitch = convert->fallback_buffer[plane].cuda_stride;
1059 copy_params.dstDevice = convert->fallback_buffer[plane].device_ptr;
1060 copy_params.WidthInBytes = GST_VIDEO_INFO_COMP_WIDTH (info, plane)
1061 * GST_VIDEO_INFO_COMP_PSTRIDE (info, plane);
1062 copy_params.Height = GST_VIDEO_INFO_COMP_HEIGHT (info, plane);
1064 cuda_ret = CuMemcpy2DAsync (©_params, cuda_stream);
1065 if (!gst_cuda_result (cuda_ret)) {
1066 GST_ERROR ("failed to copy to fallback buffer");
1070 src_ptr = convert->fallback_buffer[plane].device_ptr;
1071 stride = convert->fallback_buffer[plane].cuda_stride;
1074 /* Use h/w linear interpolation only when resize is required.
1075 * Otherwise the image might be blurred */
1076 if (convert->keep_size)
1077 mode = CU_TR_FILTER_MODE_POINT;
1079 mode = CU_TR_FILTER_MODE_LINEAR;
1081 return convert_create_texture_unchecked (src_ptr,
1082 GST_VIDEO_INFO_COMP_WIDTH (info, plane),
1083 GST_VIDEO_INFO_COMP_HEIGHT (info, plane), channels, stride, format, mode,
1087 /* main conversion function for YUV to YUV conversion */
1089 convert_YUV_TO_YUV (GstCudaConverter * convert,
1090 const GstCudaMemory * src, GstVideoInfo * in_info, GstCudaMemory * dst,
1091 GstVideoInfo * out_info, CUstream cuda_stream)
1093 CUtexObject texture[GST_VIDEO_MAX_PLANES] = { 0, };
1095 gboolean ret = FALSE;
1096 CUdeviceptr dst_ptr[GST_VIDEO_MAX_PLANES] = { 0, };
1101 gpointer kernel_args[] = { &texture[0], &texture[1], &texture[2],
1102 &dst_ptr[0], &dst_ptr[1], &dst_ptr[2], &dst_stride
1106 * STEP 1: create CUtexObject per plane
1107 * STEP 2: call YUV to YUV conversion kernel function.
1108 * resize, uv reordering and bitdepth conversion will be performed in
1109 * the CUDA kernel function
1112 /* map CUDA device memory to CUDA texture object */
1113 for (i = 0; i < GST_VIDEO_INFO_N_PLANES (in_info); i++) {
1114 texture[i] = convert_create_texture (convert, src, in_info, i, cuda_stream);
1116 GST_ERROR ("couldn't create texture for %d th plane", i);
1121 for (i = 0; i < GST_VIDEO_INFO_N_PLANES (out_info); i++)
1122 dst_ptr[i] = dst->data + dst->offset[i];
1124 dst_stride = dst->stride;
1126 width = GST_VIDEO_INFO_WIDTH (out_info);
1127 height = GST_VIDEO_INFO_HEIGHT (out_info);
1130 CuLaunchKernel (convert->kernel_func[0], DIV_UP (width, CUDA_BLOCK_X),
1131 DIV_UP (height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0,
1132 cuda_stream, kernel_args, NULL);
1134 if (!gst_cuda_result (cuda_ret)) {
1135 GST_ERROR ("could not rescale plane");
1140 gst_cuda_result (CuStreamSynchronize (cuda_stream));
1143 for (i = 0; i < GST_VIDEO_INFO_N_PLANES (in_info); i++) {
1145 gst_cuda_result (CuTexObjectDestroy (texture[i]));
1151 /* main conversion function for YUV to RGB conversion */
1153 convert_YUV_TO_RGB (GstCudaConverter * convert,
1154 const GstCudaMemory * src, GstVideoInfo * in_info, GstCudaMemory * dst,
1155 GstVideoInfo * out_info, CUstream cuda_stream)
1157 CUtexObject texture[GST_VIDEO_MAX_PLANES] = { 0, };
1159 gboolean ret = FALSE;
1160 CUdeviceptr dstRGB = 0;
1165 gpointer kernel_args[] = { &texture[0], &texture[1], &texture[2],
1166 &dstRGB, &dst_stride
1170 * STEP 1: create CUtexObject per plane
1171 * STEP 2: call YUV to RGB conversion kernel function.
1172 * resizing, argb ordering and bitdepth conversion will be performed in
1173 * the CUDA kernel function
1176 /* map CUDA device memory to CUDA texture object */
1177 for (i = 0; i < GST_VIDEO_INFO_N_PLANES (in_info); i++) {
1178 texture[i] = convert_create_texture (convert, src, in_info, i, cuda_stream);
1180 GST_ERROR ("couldn't create texture for %d th plane", i);
1186 dst_stride = dst->stride;
1188 width = GST_VIDEO_INFO_WIDTH (out_info);
1189 height = GST_VIDEO_INFO_HEIGHT (out_info);
1192 CuLaunchKernel (convert->kernel_func[0], DIV_UP (width, CUDA_BLOCK_X),
1193 DIV_UP (height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0,
1194 cuda_stream, kernel_args, NULL);
1196 if (!gst_cuda_result (cuda_ret)) {
1197 GST_ERROR ("could not rescale plane");
1202 gst_cuda_result (CuStreamSynchronize (cuda_stream));
1205 for (i = 0; i < GST_VIDEO_INFO_N_PLANES (in_info); i++) {
1207 gst_cuda_result (CuTexObjectDestroy (texture[i]));
1214 convert_UNPACK_RGB (GstCudaConverter * convert, CUfunction kernel_func,
1215 CUstream cuda_stream, const GstCudaMemory * src, GstVideoInfo * in_info,
1216 CUdeviceptr dst, gint dst_stride, GstCudaRGBOrder * rgb_order)
1218 CUdeviceptr srcRGB = 0;
1220 gint src_stride, src_pstride;
1223 gpointer unpack_kernel_args[] = { &srcRGB, &dst,
1225 &src_stride, &src_pstride, &dst_stride,
1226 &convert->in_rgb_order.R, &convert->in_rgb_order.G,
1227 &convert->in_rgb_order.B, &convert->in_rgb_order.A,
1231 src_stride = src->stride;
1233 width = GST_VIDEO_INFO_WIDTH (in_info);
1234 height = GST_VIDEO_INFO_HEIGHT (in_info);
1235 src_pstride = GST_VIDEO_INFO_COMP_PSTRIDE (in_info, 0);
1238 CuLaunchKernel (kernel_func, DIV_UP (width, CUDA_BLOCK_X),
1239 DIV_UP (height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0,
1240 cuda_stream, unpack_kernel_args, NULL);
1242 if (!gst_cuda_result (cuda_ret)) {
1243 GST_ERROR ("could not unpack rgb");
1251 convert_TO_Y444 (GstCudaConverter * convert, CUfunction kernel_func,
1252 CUstream cuda_stream, CUtexObject srcRGB, CUdeviceptr dstY, gint y_stride,
1253 CUdeviceptr dstU, gint u_stride, CUdeviceptr dstV, gint v_stride,
1254 gint width, gint height, gint pstride, gint bitdepth)
1258 gpointer kernel_args[] = { &srcRGB, &dstY, &y_stride, &dstU, &u_stride, &dstV,
1259 &v_stride, &width, &height, &pstride, &bitdepth,
1263 CuLaunchKernel (kernel_func, DIV_UP (width, CUDA_BLOCK_X),
1264 DIV_UP (height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0,
1265 cuda_stream, kernel_args, NULL);
1267 if (!gst_cuda_result (cuda_ret)) {
1268 GST_ERROR ("could not unpack rgb");
1275 /* main conversion function for RGB to YUV conversion */
1277 convert_RGB_TO_YUV (GstCudaConverter * convert,
1278 const GstCudaMemory * src, GstVideoInfo * in_info, GstCudaMemory * dst,
1279 GstVideoInfo * out_info, CUstream cuda_stream)
1281 CUtexObject texture = 0;
1282 CUtexObject yuv_texture[3] = { 0, };
1283 CUdeviceptr dst_ptr[GST_VIDEO_MAX_PLANES] = { 0, };
1285 gboolean ret = FALSE;
1286 gint in_width, in_height;
1287 gint out_width, out_height;
1289 CUarray_format format = CU_AD_FORMAT_UNSIGNED_INT8;
1290 CUfilter_mode mode = CU_TR_FILTER_MODE_POINT;
1295 gpointer kernel_args[] = { &yuv_texture[0], &yuv_texture[1], &yuv_texture[2],
1296 &dst_ptr[0], &dst_ptr[1], &dst_ptr[2], &dst_stride
1300 * STEP 1: unpack src RGB into ARGB or ARGB64 format
1301 * STEP 2: convert unpacked ARGB (or ARGB64) to Y444 (or Y444_16LE)
1302 * STEP 3: convert Y444 (or Y444_16LE) to final YUV format.
1303 * resizing, bitdepth conversion, uv reordering will be performed in
1304 * the CUDA kernel function
1306 if (!convert_UNPACK_RGB (convert, convert->kernel_func[0], cuda_stream,
1307 src, in_info, convert->unpack_surface.device_ptr,
1308 convert->unpack_surface.cuda_stride, &convert->in_rgb_order)) {
1309 GST_ERROR ("could not unpack input rgb");
1314 in_width = GST_VIDEO_INFO_WIDTH (in_info);
1315 in_height = GST_VIDEO_INFO_HEIGHT (in_info);
1317 out_width = GST_VIDEO_INFO_WIDTH (out_info);
1318 out_height = GST_VIDEO_INFO_HEIGHT (out_info);
1319 dst_stride = dst->stride;
1321 if (GST_VIDEO_INFO_COMP_DEPTH (in_info, 0) > 8) {
1324 format = CU_AD_FORMAT_UNSIGNED_INT16;
1328 convert_create_texture_unchecked (convert->unpack_surface.device_ptr,
1329 in_width, in_height, 4, convert->unpack_surface.cuda_stride, format,
1333 GST_ERROR ("could not create texture");
1337 if (!convert_TO_Y444 (convert, convert->kernel_func[1], cuda_stream, texture,
1338 convert->y444_surface[0].device_ptr,
1339 convert->y444_surface[0].cuda_stride,
1340 convert->y444_surface[1].device_ptr,
1341 convert->y444_surface[1].cuda_stride,
1342 convert->y444_surface[2].device_ptr,
1343 convert->y444_surface[2].cuda_stride, in_width, in_height, pstride,
1345 GST_ERROR ("could not convert to Y444 or Y444_16LE");
1349 /* Use h/w linear interpolation only when resize is required.
1350 * Otherwise the image might be blurred */
1351 if (convert->keep_size)
1352 mode = CU_TR_FILTER_MODE_POINT;
1354 mode = CU_TR_FILTER_MODE_LINEAR;
1356 for (i = 0; i < 3; i++) {
1358 convert_create_texture_unchecked (convert->y444_surface[i].device_ptr,
1359 in_width, in_height, 1, convert->y444_surface[i].cuda_stride, format,
1362 if (!yuv_texture[i]) {
1363 GST_ERROR ("could not create %dth yuv texture", i);
1368 for (i = 0; i < GST_VIDEO_INFO_N_PLANES (out_info); i++)
1369 dst_ptr[i] = dst->data + dst->offset[i];
1372 CuLaunchKernel (convert->kernel_func[2], DIV_UP (out_width, CUDA_BLOCK_X),
1373 DIV_UP (out_height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0,
1374 cuda_stream, kernel_args, NULL);
1376 if (!gst_cuda_result (cuda_ret)) {
1377 GST_ERROR ("could not rescale plane");
1382 gst_cuda_result (CuStreamSynchronize (cuda_stream));
1386 gst_cuda_result (CuTexObjectDestroy (texture));
1387 for (i = 0; i < 3; i++) {
1389 gst_cuda_result (CuTexObjectDestroy (yuv_texture[i]));
1395 /* main conversion function for RGB to RGB conversion */
1397 convert_RGB_TO_RGB (GstCudaConverter * convert,
1398 const GstCudaMemory * src, GstVideoInfo * in_info, GstCudaMemory * dst,
1399 GstVideoInfo * out_info, CUstream cuda_stream)
1401 CUtexObject texture = 0;
1403 gboolean ret = FALSE;
1404 CUdeviceptr dstRGB = 0;
1405 gint in_width, in_height;
1406 gint out_width, out_height;
1409 CUarray_format format = CU_AD_FORMAT_UNSIGNED_INT8;
1411 gpointer rescale_kernel_args[] = { &texture, &dstRGB, &dst_stride };
1414 * STEP 1: unpack src RGB into ARGB or ARGB64 format
1415 * STEP 2: convert ARGB (or ARGB64) to final RGB format.
1416 * resizing, bitdepth conversion, argb reordering will be performed in
1417 * the CUDA kernel function
1420 if (!convert_UNPACK_RGB (convert, convert->kernel_func[0], cuda_stream,
1421 src, in_info, convert->unpack_surface.device_ptr,
1422 convert->unpack_surface.cuda_stride, &convert->in_rgb_order)) {
1423 GST_ERROR ("could not unpack input rgb");
1428 in_width = GST_VIDEO_INFO_WIDTH (in_info);
1429 in_height = GST_VIDEO_INFO_HEIGHT (in_info);
1431 out_width = GST_VIDEO_INFO_WIDTH (out_info);
1432 out_height = GST_VIDEO_INFO_HEIGHT (out_info);
1435 dst_stride = dst->stride;
1437 if (GST_VIDEO_INFO_COMP_DEPTH (in_info, 0) > 8)
1438 format = CU_AD_FORMAT_UNSIGNED_INT16;
1440 /* Use h/w linear interpolation only when resize is required.
1441 * Otherwise the image might be blurred */
1442 if (convert->keep_size)
1443 mode = CU_TR_FILTER_MODE_POINT;
1445 mode = CU_TR_FILTER_MODE_LINEAR;
1448 convert_create_texture_unchecked (convert->unpack_surface.device_ptr,
1449 in_width, in_height, 4, convert->unpack_surface.cuda_stride, format,
1453 GST_ERROR ("could not create texture");
1458 CuLaunchKernel (convert->kernel_func[1], DIV_UP (out_width, CUDA_BLOCK_X),
1459 DIV_UP (out_height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0,
1460 cuda_stream, rescale_kernel_args, NULL);
1462 if (!gst_cuda_result (cuda_ret)) {
1463 GST_ERROR ("could not rescale plane");
1468 gst_cuda_result (CuStreamSynchronize (cuda_stream));
1472 gst_cuda_result (CuTexObjectDestroy (texture));
1477 /* from video-converter.c */
1484 color_matrix_set_identity (MatrixData * m)
1488 for (i = 0; i < 4; i++) {
1489 for (j = 0; j < 4; j++) {
1490 m->dm[i][j] = (i == j);
1496 color_matrix_copy (MatrixData * d, const MatrixData * s)
1500 for (i = 0; i < 4; i++)
1501 for (j = 0; j < 4; j++)
1502 d->dm[i][j] = s->dm[i][j];
1505 /* Perform 4x4 matrix multiplication:
1506 * - @dst@ = @a@ * @b@
1507 * - @dst@ may be a pointer to @a@ andor @b@
1510 color_matrix_multiply (MatrixData * dst, MatrixData * a, MatrixData * b)
1515 for (i = 0; i < 4; i++) {
1516 for (j = 0; j < 4; j++) {
1518 for (k = 0; k < 4; k++) {
1519 x += a->dm[i][k] * b->dm[k][j];
1524 color_matrix_copy (dst, &tmp);
1528 color_matrix_offset_components (MatrixData * m, gdouble a1, gdouble a2,
1533 color_matrix_set_identity (&a);
1537 color_matrix_multiply (m, &a, m);
1541 color_matrix_scale_components (MatrixData * m, gdouble a1, gdouble a2,
1546 color_matrix_set_identity (&a);
1550 color_matrix_multiply (m, &a, m);
1554 color_matrix_debug (const MatrixData * s)
1556 GST_DEBUG ("[%f %f %f %f]", s->dm[0][0], s->dm[0][1], s->dm[0][2],
1558 GST_DEBUG ("[%f %f %f %f]", s->dm[1][0], s->dm[1][1], s->dm[1][2],
1560 GST_DEBUG ("[%f %f %f %f]", s->dm[2][0], s->dm[2][1], s->dm[2][2],
1562 GST_DEBUG ("[%f %f %f %f]", s->dm[3][0], s->dm[3][1], s->dm[3][2],
1567 color_matrix_YCbCr_to_RGB (MatrixData * m, gdouble Kr, gdouble Kb)
1569 gdouble Kg = 1.0 - Kr - Kb;
1572 {1., 0., 2 * (1 - Kr), 0.},
1573 {1., -2 * Kb * (1 - Kb) / Kg, -2 * Kr * (1 - Kr) / Kg, 0.},
1574 {1., 2 * (1 - Kb), 0., 0.},
1579 color_matrix_multiply (m, &k, m);
1583 color_matrix_RGB_to_YCbCr (MatrixData * m, gdouble Kr, gdouble Kb)
1585 gdouble Kg = 1.0 - Kr - Kb;
1594 x = 1 / (2 * (1 - Kb));
1595 k.dm[1][0] = -x * Kr;
1596 k.dm[1][1] = -x * Kg;
1597 k.dm[1][2] = x * (1 - Kb);
1600 x = 1 / (2 * (1 - Kr));
1601 k.dm[2][0] = x * (1 - Kr);
1602 k.dm[2][1] = -x * Kg;
1603 k.dm[2][2] = -x * Kb;
1611 color_matrix_multiply (m, &k, m);
1615 compute_matrix_to_RGB (GstCudaConverter * convert, MatrixData * data,
1616 GstVideoInfo * info)
1618 gdouble Kr = 0, Kb = 0;
1619 gint offset[4], scale[4];
1621 /* bring color components to [0..1.0] range */
1622 gst_video_color_range_offsets (info->colorimetry.range, info->finfo, offset,
1625 color_matrix_offset_components (data, -offset[0], -offset[1], -offset[2]);
1626 color_matrix_scale_components (data, 1 / ((float) scale[0]),
1627 1 / ((float) scale[1]), 1 / ((float) scale[2]));
1629 if (!GST_VIDEO_INFO_IS_RGB (info)) {
1630 /* bring components to R'G'B' space */
1631 if (gst_video_color_matrix_get_Kr_Kb (info->colorimetry.matrix, &Kr, &Kb))
1632 color_matrix_YCbCr_to_RGB (data, Kr, Kb);
1634 color_matrix_debug (data);
1638 compute_matrix_to_YUV (GstCudaConverter * convert, MatrixData * data,
1639 GstVideoInfo * info)
1641 gdouble Kr = 0, Kb = 0;
1642 gint offset[4], scale[4];
1644 if (!GST_VIDEO_INFO_IS_RGB (info)) {
1645 /* bring components to YCbCr space */
1646 if (gst_video_color_matrix_get_Kr_Kb (info->colorimetry.matrix, &Kr, &Kb))
1647 color_matrix_RGB_to_YCbCr (data, Kr, Kb);
1650 /* bring color components to nominal range */
1651 gst_video_color_range_offsets (info->colorimetry.range, info->finfo, offset,
1654 color_matrix_scale_components (data, (float) scale[0], (float) scale[1],
1656 color_matrix_offset_components (data, offset[0], offset[1], offset[2]);
1658 color_matrix_debug (data);
1662 cuda_converter_get_matrix (GstCudaConverter * convert, MatrixData * matrix,
1663 GstVideoInfo * in_info, GstVideoInfo * out_info)
1665 gboolean same_matrix, same_bits;
1666 guint in_bits, out_bits;
1668 in_bits = GST_VIDEO_INFO_COMP_DEPTH (in_info, 0);
1669 out_bits = GST_VIDEO_INFO_COMP_DEPTH (out_info, 0);
1671 same_bits = in_bits == out_bits;
1672 same_matrix = in_info->colorimetry.matrix == out_info->colorimetry.matrix;
1674 GST_DEBUG ("matrix %d -> %d (%d)", in_info->colorimetry.matrix,
1675 out_info->colorimetry.matrix, same_matrix);
1676 GST_DEBUG ("bits %d -> %d (%d)", in_bits, out_bits, same_bits);
1678 color_matrix_set_identity (matrix);
1680 if (same_bits && same_matrix) {
1681 GST_DEBUG ("conversion matrix is not required");
1686 if (in_bits < out_bits) {
1687 gint scale = 1 << (out_bits - in_bits);
1688 color_matrix_scale_components (matrix,
1689 1 / (float) scale, 1 / (float) scale, 1 / (float) scale);
1692 GST_DEBUG ("to RGB matrix");
1693 compute_matrix_to_RGB (convert, matrix, in_info);
1694 GST_DEBUG ("current matrix");
1695 color_matrix_debug (matrix);
1697 GST_DEBUG ("to YUV matrix");
1698 compute_matrix_to_YUV (convert, matrix, out_info);
1699 GST_DEBUG ("current matrix");
1700 color_matrix_debug (matrix);
1702 if (in_bits > out_bits) {
1703 gint scale = 1 << (in_bits - out_bits);
1704 color_matrix_scale_components (matrix,
1705 (float) scale, (float) scale, (float) scale);
1708 GST_DEBUG ("final matrix");
1709 color_matrix_debug (matrix);
1715 is_uv_swapped (GstVideoFormat format)
1717 static GstVideoFormat swapped_formats[] = {
1718 GST_VIDEO_FORMAT_YV12,
1719 GST_VIDEO_FORMAT_NV21,
1723 for (i = 0; i < G_N_ELEMENTS (swapped_formats); i++) {
1724 if (format == swapped_formats[i])
1733 const gchar *read_chroma;
1734 const gchar *write_chroma;
1735 const gchar *unpack_function;
1736 gfloat scale_h, scale_v;
1737 gfloat chroma_scale_h, chroma_scale_v;
1739 gint chroma_width, chroma_height;
1742 gint pstride, chroma_pstride;
1743 gint in_shift, out_shift;
1746 /* RGBA specific variables */
1748 GstCudaRGBOrder rgb_order;
1749 } GstCudaKernelTempl;
1752 cuda_converter_generate_yuv_to_yuv_kernel_code (GstCudaConverter * convert,
1753 GstCudaKernelTempl * templ)
1755 return g_strdup_printf (templ_YUV_TO_YUV,
1756 templ->scale_h, templ->scale_v, templ->chroma_scale_h,
1757 templ->chroma_scale_v, templ->width, templ->height, templ->chroma_width,
1758 templ->chroma_height, templ->in_depth, templ->out_depth, templ->pstride,
1759 templ->chroma_pstride, templ->in_shift, templ->out_shift, templ->mask,
1760 templ->swap_uv, templ->read_chroma, templ->write_chroma);
1764 cuda_converter_generate_yuv_to_rgb_kernel_code (GstCudaConverter * convert,
1765 GstCudaKernelTempl * templ, MatrixData * matrix)
1767 return g_strdup_printf (templ_YUV_TO_RGB,
1768 matrix->dm[0][3], matrix->dm[1][3], matrix->dm[2][3],
1769 matrix->dm[0][0], matrix->dm[0][1], matrix->dm[0][2],
1770 matrix->dm[1][0], matrix->dm[1][1], matrix->dm[1][2],
1771 matrix->dm[2][0], matrix->dm[2][1], matrix->dm[2][2],
1772 templ->scale_h, templ->scale_v, templ->chroma_scale_h,
1773 templ->chroma_scale_v, templ->width, templ->height, templ->chroma_width,
1774 templ->chroma_height, templ->in_depth, templ->out_depth, templ->pstride,
1775 templ->chroma_pstride, templ->in_shift, templ->out_shift, templ->mask,
1776 templ->swap_uv, templ->max_in_val, templ->rgb_order.R,
1777 templ->rgb_order.G, templ->rgb_order.B, templ->rgb_order.A,
1778 templ->rgb_order.X, templ->read_chroma);
1782 cuda_converter_generate_rgb_to_yuv_kernel_code (GstCudaConverter * convert,
1783 GstCudaKernelTempl * templ, MatrixData * matrix)
1785 return g_strdup_printf (templ_RGB_TO_YUV,
1786 matrix->dm[0][3], matrix->dm[1][3], matrix->dm[2][3],
1787 matrix->dm[0][0], matrix->dm[0][1], matrix->dm[0][2],
1788 matrix->dm[1][0], matrix->dm[1][1], matrix->dm[1][2],
1789 matrix->dm[2][0], matrix->dm[2][1], matrix->dm[2][2],
1790 templ->scale_h, templ->scale_v, templ->chroma_scale_h,
1791 templ->chroma_scale_v, templ->width, templ->height, templ->chroma_width,
1792 templ->chroma_height, templ->in_depth, templ->out_depth, templ->pstride,
1793 templ->chroma_pstride, templ->in_shift, templ->out_shift, templ->mask,
1794 templ->swap_uv, templ->unpack_function, templ->read_chroma,
1795 templ->write_chroma);
1799 cuda_converter_generate_rgb_to_rgb_kernel_code (GstCudaConverter * convert,
1800 GstCudaKernelTempl * templ)
1802 return g_strdup_printf (templ_RGB_to_RGB,
1803 templ->scale_h, templ->scale_v,
1804 templ->width, templ->height,
1805 templ->in_depth, templ->out_depth, templ->pstride,
1806 templ->rgb_order.R, templ->rgb_order.G,
1807 templ->rgb_order.B, templ->rgb_order.A, templ->rgb_order.X,
1808 templ->unpack_function);
1811 #define SET_ORDER(o,r,g,b,a,x) G_STMT_START { \
1820 cuda_converter_get_rgb_order (GstVideoFormat format, GstCudaRGBOrder * order)
1823 case GST_VIDEO_FORMAT_RGBA:
1824 SET_ORDER (order, 0, 1, 2, 3, -1);
1826 case GST_VIDEO_FORMAT_RGBx:
1827 SET_ORDER (order, 0, 1, 2, -1, 3);
1829 case GST_VIDEO_FORMAT_BGRA:
1830 SET_ORDER (order, 2, 1, 0, 3, -1);
1832 case GST_VIDEO_FORMAT_BGRx:
1833 SET_ORDER (order, 2, 1, 0, -1, 3);
1835 case GST_VIDEO_FORMAT_ARGB:
1836 SET_ORDER (order, 1, 2, 3, 0, -1);
1838 case GST_VIDEO_FORMAT_ABGR:
1839 SET_ORDER (order, 3, 2, 1, 0, -1);
1841 case GST_VIDEO_FORMAT_RGB:
1842 SET_ORDER (order, 0, 1, 2, -1, -1);
1844 case GST_VIDEO_FORMAT_BGR:
1845 SET_ORDER (order, 2, 1, 0, -1, -1);
1847 case GST_VIDEO_FORMAT_BGR10A2_LE:
1848 SET_ORDER (order, 1, 2, 3, 0, -1);
1850 case GST_VIDEO_FORMAT_RGB10A2_LE:
1851 SET_ORDER (order, 3, 2, 1, 0, -1);
1854 g_assert_not_reached ();
1860 cuda_converter_lookup_path (GstCudaConverter * convert)
1862 GstVideoFormat in_format, out_format;
1863 gboolean src_yuv, dst_yuv;
1864 gboolean src_planar, dst_planar;
1865 GstCudaKernelTempl templ = { 0, };
1866 GstVideoInfo *in_info, *out_info;
1867 gboolean ret = FALSE;
1870 in_info = &convert->in_info;
1871 out_info = &convert->out_info;
1873 in_format = GST_VIDEO_INFO_FORMAT (in_info);
1874 out_format = GST_VIDEO_INFO_FORMAT (out_info);
1876 src_yuv = GST_VIDEO_INFO_IS_YUV (in_info);
1877 dst_yuv = GST_VIDEO_INFO_IS_YUV (out_info);
1879 src_planar = GST_VIDEO_INFO_N_PLANES (in_info) ==
1880 GST_VIDEO_INFO_N_COMPONENTS (in_info);
1881 dst_planar = GST_VIDEO_INFO_N_PLANES (out_info) ==
1882 GST_VIDEO_INFO_N_COMPONENTS (out_info);
1884 convert->keep_size = (GST_VIDEO_INFO_WIDTH (&convert->in_info) ==
1885 GST_VIDEO_INFO_WIDTH (&convert->out_info) &&
1886 GST_VIDEO_INFO_HEIGHT (&convert->in_info) ==
1887 GST_VIDEO_INFO_HEIGHT (&convert->out_info));
1889 templ.scale_h = (gfloat) GST_VIDEO_INFO_COMP_WIDTH (in_info, 0) /
1890 (gfloat) GST_VIDEO_INFO_COMP_WIDTH (out_info, 0);
1891 templ.scale_v = (gfloat) GST_VIDEO_INFO_COMP_HEIGHT (in_info, 0) /
1892 (gfloat) GST_VIDEO_INFO_COMP_HEIGHT (out_info, 0);
1893 templ.chroma_scale_h = (gfloat) GST_VIDEO_INFO_COMP_WIDTH (in_info, 1) /
1894 (gfloat) GST_VIDEO_INFO_COMP_WIDTH (out_info, 1);
1895 templ.chroma_scale_v = (gfloat) GST_VIDEO_INFO_COMP_HEIGHT (in_info, 1) /
1896 (gfloat) GST_VIDEO_INFO_COMP_HEIGHT (out_info, 1);
1897 templ.width = GST_VIDEO_INFO_COMP_WIDTH (out_info, 0);
1898 templ.height = GST_VIDEO_INFO_COMP_HEIGHT (out_info, 0);
1899 templ.chroma_width = GST_VIDEO_INFO_COMP_WIDTH (out_info, 1);
1900 templ.chroma_height = GST_VIDEO_INFO_COMP_HEIGHT (out_info, 1);
1902 templ.in_depth = GST_VIDEO_INFO_COMP_DEPTH (in_info, 0);
1903 templ.out_depth = GST_VIDEO_INFO_COMP_DEPTH (out_info, 0);
1904 templ.pstride = GST_VIDEO_INFO_COMP_PSTRIDE (out_info, 0);
1905 templ.chroma_pstride = GST_VIDEO_INFO_COMP_PSTRIDE (out_info, 1);
1906 templ.in_shift = in_info->finfo->shift[0];
1907 templ.out_shift = out_info->finfo->shift[0];
1908 templ.mask = ((1 << templ.out_depth) - 1) << templ.out_shift;
1909 templ.swap_uv = (is_uv_swapped (in_format) != is_uv_swapped (out_format));
1911 if (src_yuv && dst_yuv) {
1912 convert->convert = convert_YUV_TO_YUV;
1914 if (src_planar && dst_planar) {
1915 templ.read_chroma = READ_CHROMA_FROM_PLANAR;
1916 templ.write_chroma = WRITE_CHROMA_TO_PLANAR;
1917 } else if (!src_planar && dst_planar) {
1918 templ.read_chroma = READ_CHROMA_FROM_SEMI_PLANAR;
1919 templ.write_chroma = WRITE_CHROMA_TO_PLANAR;
1920 } else if (src_planar && !dst_planar) {
1921 templ.read_chroma = READ_CHROMA_FROM_PLANAR;
1922 templ.write_chroma = WRITE_CHROMA_TO_SEMI_PLANAR;
1924 templ.read_chroma = READ_CHROMA_FROM_SEMI_PLANAR;
1925 templ.write_chroma = WRITE_CHROMA_TO_SEMI_PLANAR;
1928 convert->kernel_source =
1929 cuda_converter_generate_yuv_to_yuv_kernel_code (convert, &templ);
1930 convert->func_names[0] = GST_CUDA_KERNEL_FUNC;
1933 } else if (src_yuv && !dst_yuv) {
1937 templ.read_chroma = READ_CHROMA_FROM_PLANAR;
1939 templ.read_chroma = READ_CHROMA_FROM_SEMI_PLANAR;
1942 templ.max_in_val = (1 << templ.in_depth) - 1;
1943 cuda_converter_get_rgb_order (out_format, &templ.rgb_order);
1945 cuda_converter_get_matrix (convert, &matrix, in_info, out_info);
1946 convert->kernel_source =
1947 cuda_converter_generate_yuv_to_rgb_kernel_code (convert,
1949 convert->func_names[0] = GST_CUDA_KERNEL_FUNC;
1951 convert->convert = convert_YUV_TO_RGB;
1954 } else if (!src_yuv && dst_yuv) {
1956 gsize element_size = 8;
1957 GstVideoFormat unpack_format;
1958 GstVideoFormat y444_format;
1959 GstVideoInfo unpack_info;
1960 GstVideoInfo y444_info;
1964 templ.write_chroma = WRITE_CHROMA_TO_PLANAR;
1966 templ.write_chroma = WRITE_CHROMA_TO_SEMI_PLANAR;
1968 templ.read_chroma = READ_CHROMA_FROM_PLANAR;
1970 cuda_converter_get_rgb_order (in_format, &convert->in_rgb_order);
1972 if (templ.in_depth > 8) {
1973 /* FIXME: RGB10A2_LE and BGR10A2_LE only */
1975 unpack_format = GST_VIDEO_FORMAT_ARGB64;
1976 y444_format = GST_VIDEO_FORMAT_Y444_16LE;
1977 templ.unpack_function = unpack_to_ARGB64;
1979 unpack_format = GST_VIDEO_FORMAT_ARGB;
1980 y444_format = GST_VIDEO_FORMAT_Y444;
1981 templ.unpack_function = unpack_to_ARGB;
1984 gst_video_info_set_format (&unpack_info,
1985 unpack_format, GST_VIDEO_INFO_WIDTH (in_info),
1986 GST_VIDEO_INFO_HEIGHT (in_info));
1987 gst_video_info_set_format (&y444_info,
1988 y444_format, GST_VIDEO_INFO_WIDTH (in_info),
1989 GST_VIDEO_INFO_HEIGHT (in_info));
1991 templ.in_depth = GST_VIDEO_INFO_COMP_DEPTH (&unpack_info, 0);
1993 cuda_ret = CuMemAllocPitch (&convert->unpack_surface.device_ptr,
1994 &convert->unpack_surface.cuda_stride,
1995 GST_VIDEO_INFO_COMP_WIDTH (&unpack_info, 0) *
1996 GST_VIDEO_INFO_COMP_PSTRIDE (&unpack_info, 0),
1997 GST_VIDEO_INFO_HEIGHT (&unpack_info), element_size);
1999 if (!gst_cuda_result (cuda_ret)) {
2000 GST_ERROR ("couldn't alloc unpack surface");
2004 for (i = 0; i < 3; i++) {
2005 cuda_ret = CuMemAllocPitch (&convert->y444_surface[i].device_ptr,
2006 &convert->y444_surface[i].cuda_stride,
2007 GST_VIDEO_INFO_COMP_WIDTH (&y444_info, i) *
2008 GST_VIDEO_INFO_COMP_PSTRIDE (&y444_info, i),
2009 GST_VIDEO_INFO_COMP_HEIGHT (&y444_info, i), element_size);
2011 if (!gst_cuda_result (cuda_ret)) {
2012 GST_ERROR ("couldn't alloc %dth y444 surface", i);
2017 cuda_converter_get_matrix (convert, &matrix, &unpack_info, &y444_info);
2019 convert->kernel_source =
2020 cuda_converter_generate_rgb_to_yuv_kernel_code (convert,
2023 convert->func_names[0] = GST_CUDA_KERNEL_FUNC_TO_ARGB;
2024 convert->func_names[1] = GST_CUDA_KERNEL_FUNC_TO_Y444;
2025 convert->func_names[2] = GST_CUDA_KERNEL_FUNC_Y444_TO_YUV;
2027 convert->convert = convert_RGB_TO_YUV;
2031 gsize element_size = 8;
2032 GstVideoFormat unpack_format;
2033 GstVideoInfo unpack_info;
2035 cuda_converter_get_rgb_order (in_format, &convert->in_rgb_order);
2036 cuda_converter_get_rgb_order (out_format, &templ.rgb_order);
2038 if (templ.in_depth > 8) {
2039 /* FIXME: RGB10A2_LE and BGR10A2_LE only */
2041 unpack_format = GST_VIDEO_FORMAT_ARGB64;
2042 templ.unpack_function = unpack_to_ARGB64;
2044 unpack_format = GST_VIDEO_FORMAT_ARGB;
2045 templ.unpack_function = unpack_to_ARGB;
2048 gst_video_info_set_format (&unpack_info,
2049 unpack_format, GST_VIDEO_INFO_WIDTH (in_info),
2050 GST_VIDEO_INFO_HEIGHT (in_info));
2052 templ.in_depth = GST_VIDEO_INFO_COMP_DEPTH (&unpack_info, 0);
2054 cuda_ret = CuMemAllocPitch (&convert->unpack_surface.device_ptr,
2055 &convert->unpack_surface.cuda_stride,
2056 GST_VIDEO_INFO_COMP_WIDTH (&unpack_info, 0) *
2057 GST_VIDEO_INFO_COMP_PSTRIDE (&unpack_info, 0),
2058 GST_VIDEO_INFO_HEIGHT (&unpack_info), element_size);
2060 if (!gst_cuda_result (cuda_ret)) {
2061 GST_ERROR ("couldn't alloc unpack surface");
2065 convert->kernel_source =
2066 cuda_converter_generate_rgb_to_rgb_kernel_code (convert, &templ);
2068 convert->func_names[0] = GST_CUDA_KERNEL_FUNC_TO_ARGB;
2069 convert->func_names[1] = GST_CUDA_KERNEL_FUNC_SCALE_RGB;
2071 convert->convert = convert_RGB_TO_RGB;
2077 GST_DEBUG ("no path found");
2082 GST_TRACE ("configured CUDA kernel source\n%s", convert->kernel_source);