compat/cuda/cuda_runtime.h

   1 /*
   2  * Minimum CUDA compatibility definitions header
   3  *
   4  * Copyright (c) 2019 rcombs
   5  *
   6  * This file is part of FFmpeg.
   7  *
   8  * FFmpeg is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2.1 of the License, or (at your option) any later version.
  12  *
  13  * FFmpeg is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Lesser General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with FFmpeg; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21  */
  22
  23 #ifndef COMPAT_CUDA_CUDA_RUNTIME_H
  24 #define COMPAT_CUDA_CUDA_RUNTIME_H
  25
  26 // Common macros
  27 #define __global__ __attribute__((global))
  28 #define __device__ __attribute__((device))
  29 #define __device_builtin__ __attribute__((device_builtin))
  30 #define __align__(N) __attribute__((aligned(N)))
  31 #define __inline__ __inline__ __attribute__((always_inline))
  32
  33 #define max(a, b) ((a) > (b) ? (a) : (b))
  34 #define min(a, b) ((a) < (b) ? (a) : (b))
  35 #define abs(x) ((x) < 0 ? -(x) : (x))
  36
  37 #define atomicAdd(a, b) (__atomic_fetch_add(a, b, __ATOMIC_SEQ_CST))
  38
  39 // Basic typedefs
  40 typedef __device_builtin__ unsigned long long cudaTextureObject_t;
  41
  42 typedef struct __device_builtin__ __align__(2) uchar2
  43 {
  44     unsigned char x, y;
  45 } uchar2;
  46
  47 typedef struct __device_builtin__ __align__(4) ushort2
  48 {
  49     unsigned short x, y;
  50 } ushort2;
  51
  52 typedef struct __device_builtin__ __align__(8) float2
  53 {
  54     float x, y;
  55 } float2;
  56
  57 typedef struct __device_builtin__ __align__(8) int2
  58 {
  59     int x, y;
  60 } int2;
  61
  62 typedef struct __device_builtin__ uint3
  63 {
  64     unsigned int x, y, z;
  65 } uint3;
  66
  67 typedef struct uint3 dim3;
  68
  69 typedef struct __device_builtin__ __align__(4) uchar4
  70 {
  71     unsigned char x, y, z, w;
  72 } uchar4;
  73
  74 typedef struct __device_builtin__ __align__(8) ushort4
  75 {
  76     unsigned short x, y, z, w;
  77 } ushort4;
  78
  79 typedef struct __device_builtin__ __align__(16) int4
  80 {
  81     int x, y, z, w;
  82 } int4;
  83
  84 typedef struct __device_builtin__ __align__(16) float4
  85 {
  86     float x, y, z, w;
  87 } float4;
  88
  89 // Accessors for special registers
  90 #define GETCOMP(reg, comp) \
  91     asm("mov.u32 %0, %%" #reg "." #comp ";" : "=r"(tmp)); \
  92     ret.comp = tmp;
  93
  94 #define GET(name, reg) static inline __device__ uint3 name() {\
  95     uint3 ret; \
  96     unsigned tmp; \
  97     GETCOMP(reg, x) \
  98     GETCOMP(reg, y) \
  99     GETCOMP(reg, z) \
 100     return ret; \
 101 }
 102
 103 GET(getBlockIdx, ctaid)
 104 GET(getBlockDim, ntid)
 105 GET(getThreadIdx, tid)
 106
 107 // Instead of externs for these registers, we turn access to them into calls into trivial ASM
 108 #define blockIdx (getBlockIdx())
 109 #define blockDim (getBlockDim())
 110 #define threadIdx (getThreadIdx())
 111
 112 // Basic initializers (simple macros rather than inline functions)
 113 #define make_int2(a, b) ((int2){.x = a, .y = b})
 114 #define make_uchar2(a, b) ((uchar2){.x = a, .y = b})
 115 #define make_ushort2(a, b) ((ushort2){.x = a, .y = b})
 116 #define make_float2(a, b) ((float2){.x = a, .y = b})
 117 #define make_int4(a, b, c, d) ((int4){.x = a, .y = b, .z = c, .w = d})
 118 #define make_uchar4(a, b, c, d) ((uchar4){.x = a, .y = b, .z = c, .w = d})
 119 #define make_ushort4(a, b, c, d) ((ushort4){.x = a, .y = b, .z = c, .w = d})
 120 #define make_float4(a, b, c, d) ((float4){.x = a, .y = b, .z = c, .w = d})
 121
 122 // Conversions from the tex instruction's 4-register output to various types
 123 #define TEX2D(type, ret) static inline __device__ void conv(type* out, unsigned a, unsigned b, unsigned c, unsigned d) {*out = (ret);}
 124
 125 TEX2D(unsigned char, a & 0xFF)
 126 TEX2D(unsigned short, a & 0xFFFF)
 127 TEX2D(float, a)
 128 TEX2D(uchar2, make_uchar2(a & 0xFF, b & 0xFF))
 129 TEX2D(ushort2, make_ushort2(a & 0xFFFF, b & 0xFFFF))
 130 TEX2D(float2, make_float2(a, b))
 131 TEX2D(uchar4, make_uchar4(a & 0xFF, b & 0xFF, c & 0xFF, d & 0xFF))
 132 TEX2D(ushort4, make_ushort4(a & 0xFFFF, b & 0xFFFF, c & 0xFFFF, d & 0xFFFF))
 133 TEX2D(float4, make_float4(a, b, c, d))
 134
 135 // Template calling tex instruction and converting the output to the selected type
 136 template<typename T>
 137 inline __device__ T tex2D(cudaTextureObject_t texObject, float x, float y)
 138 {
 139   T ret;
 140   unsigned ret1, ret2, ret3, ret4;
 141   asm("tex.2d.v4.u32.f32 {%0, %1, %2, %3}, [%4, {%5, %6}];" :
 142       "=r"(ret1), "=r"(ret2), "=r"(ret3), "=r"(ret4) :
 143       "l"(texObject), "f"(x), "f"(y));
 144   conv(&ret, ret1, ret2, ret3, ret4);
 145   return ret;
 146 }
 147
 148 template<>
 149 inline __device__ float4 tex2D<float4>(cudaTextureObject_t texObject, float x, float y)
 150 {
 151     float4 ret;
 152     asm("tex.2d.v4.f32.f32 {%0, %1, %2, %3}, [%4, {%5, %6}];" :
 153         "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) :
 154         "l"(texObject), "f"(x), "f"(y));
 155     return ret;
 156 }
 157
 158 template<>
 159 inline __device__ float tex2D<float>(cudaTextureObject_t texObject, float x, float y)
 160 {
 161     return tex2D<float4>(texObject, x, y).x;
 162 }
 163
 164 template<>
 165 inline __device__ float2 tex2D<float2>(cudaTextureObject_t texObject, float x, float y)
 166 {
 167     float4 ret = tex2D<float4>(texObject, x, y);
 168     return make_float2(ret.x, ret.y);
 169 }
 170
 171 // Math helper functions
 172 static inline __device__ float floorf(float a) { return __builtin_floorf(a); }
 173 static inline __device__ float floor(float a) { return __builtin_floorf(a); }
 174 static inline __device__ double floor(double a) { return __builtin_floor(a); }
 175 static inline __device__ float ceilf(float a) { return __builtin_ceilf(a); }
 176 static inline __device__ float ceil(float a) { return __builtin_ceilf(a); }
 177 static inline __device__ double ceil(double a) { return __builtin_ceil(a); }
 178 static inline __device__ float truncf(float a) { return __builtin_truncf(a); }
 179 static inline __device__ float trunc(float a) { return __builtin_truncf(a); }
 180 static inline __device__ double trunc(double a) { return __builtin_trunc(a); }
 181 static inline __device__ float fabsf(float a) { return __builtin_fabsf(a); }
 182 static inline __device__ float fabs(float a) { return __builtin_fabsf(a); }
 183 static inline __device__ double fabs(double a) { return __builtin_fabs(a); }
 184 static inline __device__ float sqrtf(float a) { return __builtin_sqrtf(a); }
 185
 186 static inline __device__ float __saturatef(float a) { return __nvvm_saturate_f(a); }
 187 static inline __device__ float __sinf(float a) { return __nvvm_sin_approx_f(a); }
 188 static inline __device__ float __cosf(float a) { return __nvvm_cos_approx_f(a); }
 189 static inline __device__ float __expf(float a) { return __nvvm_ex2_approx_f(a * (float)__builtin_log2(__builtin_exp(1))); }
 190 static inline __device__ float __powf(float a, float b) { return __nvvm_ex2_approx_f(__nvvm_lg2_approx_f(a) * b); }
 191
 192 #endif /* COMPAT_CUDA_CUDA_RUNTIME_H */