int ADims,
int step>
#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__
-C10_LAUNCH_BOUNDS(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM)
+C10_LAUNCH_BOUNDS_2(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM)
#endif
__global__ void kernelPointwiseApply1(detail::TensorInfo<scalar, IndexType> a,
IndexType totalElements, const Op op) {
int ADims, int BDims,
int step>
#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__
-C10_LAUNCH_BOUNDS(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM)
+C10_LAUNCH_BOUNDS_2(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM)
#endif
__global__ void
kernelPointwiseApply2(detail::TensorInfo<scalar1, IndexType> a,
int ADims, int BDims, int CDims,
int step>
#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__
-C10_LAUNCH_BOUNDS(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM)
+C10_LAUNCH_BOUNDS_2(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM)
#endif
__global__ void
kernelPointwiseApply3(detail::TensorInfo<scalar1, IndexType> a,
int ADims, int BDims, int CDims, int DDims,
int step>
#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__
-C10_LAUNCH_BOUNDS(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM)
+C10_LAUNCH_BOUNDS_2(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM)
#endif
__global__ void
kernelPointwiseApply4(detail::TensorInfo<scalar1, IndexType> a,
typename IndexType,
int ADims>
#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__
-C10_LAUNCH_BOUNDS(256, 8)
+C10_LAUNCH_BOUNDS_2(256, 8)
#endif
__global__ void
fused_dropout_kernel(cuda::detail::TensorInfo<scalar_t, IndexType> a,
}
template <typename scalar_t>
- C10_LAUNCH_BOUNDS(1024)
+ C10_LAUNCH_BOUNDS_1(1024)
__global__ void grid_sampler_2d_kernel(
const int nthreads,
TensorInfo<scalar_t, int> input,
}
template <typename scalar_t>
- C10_LAUNCH_BOUNDS(1024)
+ C10_LAUNCH_BOUNDS_1(1024)
__global__ void grid_sampler_3d_kernel(
const int nthreads,
TensorInfo<scalar_t, int> input,
}
template <typename scalar_t>
- C10_LAUNCH_BOUNDS(1024)
+ C10_LAUNCH_BOUNDS_1(1024)
__global__ void grid_sampler_2d_backward_kernel(
const int nthreads,
TensorInfo<scalar_t, int> grad_output,
}
template <typename scalar_t>
- C10_LAUNCH_BOUNDS(1024)
+ C10_LAUNCH_BOUNDS_1(1024)
__global__ void grid_sampler_3d_backward_kernel(
const int nthreads,
TensorInfo<scalar_t, int> grad_output,
namespace at { namespace native {
template<int nt, int vt, typename func_t>
-C10_LAUNCH_BOUNDS(nt, launch_bound2)
+C10_LAUNCH_BOUNDS_2(nt, launch_bound2)
__global__ void elementwise_kernel(int N, func_t f) {
int tid = threadIdx.x;
int nv = nt * vt;
template<typename scalar_t, typename target_t>
__global__ void
#if defined (__HIP_PLATFORM_HCC__)
-C10_LAUNCH_BOUNDS((std::is_same<scalar_t, float>::value ? 1024 : 896), 1)
+C10_LAUNCH_BOUNDS_2((std::is_same<scalar_t, float>::value ? 1024 : 896), 1)
#endif
ctc_loss_log_alpha_gpu_kernel(scalar_t* __restrict__ log_alpha_data,
const scalar_t*log_probs_data, const int64_t* __restrict__ input_lengths, int64_t max_input_length,
// alpha kernel above. (As mentioned above, it might make sense to do the calculation in the alpha kernel.)
template<typename scalar_t, typename target_t>
__global__ void
-C10_LAUNCH_BOUNDS((std::is_same<scalar_t, float>::value ? 1024 : 896), 1)
+C10_LAUNCH_BOUNDS_2((std::is_same<scalar_t, float>::value ? 1024 : 896), 1)
ctc_loss_backward_log_beta_gpu_kernel(scalar_t* __restrict__ log_beta_data,
const scalar_t*log_probs_data, const int64_t* __restrict__ input_lengths, int64_t max_input_length,
const target_t* __restrict__ targets_data, const int64_t* __restrict__ target_lengths, int64_t max_target_length,
template<typename scalar_t, typename target_t>
__global__ void
#if defined (__HIP_PLATFORM_HCC__)
-C10_LAUNCH_BOUNDS((std::is_same<scalar_t, float>::value ? 1024 : 896), 1)
+C10_LAUNCH_BOUNDS_2((std::is_same<scalar_t, float>::value ? 1024 : 896), 1)
#endif
ctc_loss_backward_collect_nonblank_gpu_kernel(scalar_t* __restrict__ gradient_data,
const scalar_t* __restrict__ grad_out_data, int64_t grad_out_batch_stride,
template<typename scalar_t, typename target_t>
__global__ void
#if defined (__HIP_PLATFORM_HCC__)
-C10_LAUNCH_BOUNDS((std::is_same<scalar_t, float>::value ? 1024 : 896), 1)
+C10_LAUNCH_BOUNDS_2((std::is_same<scalar_t, float>::value ? 1024 : 896), 1)
#endif
ctc_loss_backward_collect_gpu_kernel(scalar_t* __restrict__ gradient_data,
const scalar_t* __restrict__ grad_out_data, int64_t grad_out_batch_stride,
template <typename scalar_t, typename accscalar_t, typename index_type, int indexing_kind>
#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__
-C10_LAUNCH_BOUNDS(512, 4)
+C10_LAUNCH_BOUNDS_2(512, 4)
#endif
__global__ void lstm_cell_forward(
TensorInfo<scalar_t, index_type> input,
template <typename scalar_t, typename accscalar_t, typename index_type, int indexing_kind>
#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__
-C10_LAUNCH_BOUNDS(512, 4)
+C10_LAUNCH_BOUNDS_2(512, 4)
#endif
__global__ void lstm_cell_backward(
TensorInfo<scalar_t, index_type> storage,
template <typename scalar_t, typename accscalar_t, typename index_type, int indexing_kind>
#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__
-C10_LAUNCH_BOUNDS(512, 4)
+C10_LAUNCH_BOUNDS_2(512, 4)
#endif
__global__ void gru_cell_forward(
TensorInfo<scalar_t, index_type> Input,
template <typename scalar_t, typename accscalar_t, typename index_type, int indexing_kind>
#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__
-C10_LAUNCH_BOUNDS(512, 4)
+C10_LAUNCH_BOUNDS_2(512, 4)
#endif
__global__ void gru_cell_backward(
TensorInfo<scalar_t, index_type> gradInInput,
std::ostream& operator<<(std::ostream& out, const ReduceConfig& config);
template<int nt, typename R>
-C10_LAUNCH_BOUNDS(nt, 4)
+C10_LAUNCH_BOUNDS_2(nt, 4)
__global__ void reduce_kernel(R reduction) {
reduction.run();
}
return is_last_block_done_shared;
}
-
+
template <bool can_acc>
C10_DEVICE arg_t accumulate_in_output(
out_scalar_t* out, arg_t value,
template <typename scalar_t, typename IndexType>
#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__
-C10_LAUNCH_BOUNDS(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM)
+C10_LAUNCH_BOUNDS_2(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM)
#endif
__global__ void
kernel_pointwise_flip_apply2(const cuda::detail::TensorInfo<scalar_t, IndexType> in_tensor_info,
typename FinalizeOp,
int ADims, int BDims>
#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__
-C10_LAUNCH_BOUNDS(512, 4)
+C10_LAUNCH_BOUNDS_2(512, 4)
#endif
__global__ void kernelReduceNoncontigDim_shared
(TensorInfo<T, IndexType> out,
typename FinalizeOp,
int ADims, int BDims>
#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__
-C10_LAUNCH_BOUNDS(512, 4)
+C10_LAUNCH_BOUNDS_2(512, 4)
#endif
__global__ void
kernelReduceNoncontigDim(TensorInfo<T, IndexType> out,
int ADims>
__global__ void
#if defined(__HIP_PLATFORM_HCC__)
-C10_LAUNCH_BOUNDS(THC_REDUCE_ALL_BLOCK_SIZE)
+C10_LAUNCH_BOUNDS_1(THC_REDUCE_ALL_BLOCK_SIZE)
#endif
kernelReduceAll(TensorInfo<T, IndexType> in,
IndexType totalElements,
/*
Only instantiates the all 1D special case and the fallback all nD case for
- large (64-bit indexed) tensors to reduce compilation time.
+ large (64-bit indexed) tensors to reduce compilation time.
*/
if (inInfo.dims == 1) {
HANDLE_IN_CASE(uint64_t, 1);
template <typename K, typename V,
int KeyDims, int ValueDims,
typename Comparator, typename IndexType, int Power2SortSize>
-C10_LAUNCH_BOUNDS(1024)
+C10_LAUNCH_BOUNDS_1(1024)
__global__ void
bitonicSortKVInPlace(TensorInfo<K, IndexType> keys,
IndexType keySlices,
}
template <typename T, typename IndexType, int Dim, bool Order>
-C10_LAUNCH_BOUNDS(1024)
+C10_LAUNCH_BOUNDS_1(1024)
__global__ void gatherTopK(TensorInfo<T, IndexType> input,
IndexType inputSliceSize,
IndexType outputSliceSize, // aka `k`
template <typename Dtype, typename Acctype>
#if defined(__HIP_PLATFORM_HCC__)
-C10_LAUNCH_BOUNDS(MULTILABELMARGIN_THREADS)
+C10_LAUNCH_BOUNDS_1(MULTILABELMARGIN_THREADS)
#endif
__global__ void cunn_MultiLabelMarginCriterion_updateOutput_kernel(Dtype *output,
Dtype *input,
template <typename Dtype, typename Acctype>
#if defined(__HIP_PLATFORM_HCC__)
-C10_LAUNCH_BOUNDS(MULTILABELMARGIN_THREADS)
+C10_LAUNCH_BOUNDS_1(MULTILABELMARGIN_THREADS)
#endif
__global__ void cunn_MultiLabelMarginCriterion_updateGradInput_kernel(Dtype *gradInput,
Dtype *gradOutput,
template <typename T, typename AccumT>
#if defined(__HIP_PLATFORM_HCC__)
-C10_LAUNCH_BOUNDS(1024)
+C10_LAUNCH_BOUNDS_1(1024)
#endif
__global__ void cunn_SpatialClassNLLCriterion_updateOutput_kernel(
T *output,
template <typename Dtype, typename Acctype>
__global__ void
#if __CUDA_ARCH__ >= 320 || defined __HIP_PLATFORM_HCC__
-C10_LAUNCH_BOUNDS(CUDA_NUM_THREADS)
+C10_LAUNCH_BOUNDS_1(CUDA_NUM_THREADS)
#endif
LRNFillScale(const int nthreads, const Dtype* const in,
const int num, const int channels, const int height,
template <typename Dtype, typename AccType>
#if defined (__HIP_PLATFORM_HCC__)
-C10_LAUNCH_BOUNDS(BACKWARD_THREADS, 4)
+C10_LAUNCH_BOUNDS_2(BACKWARD_THREADS, 4)
#else
-C10_LAUNCH_BOUNDS(BACKWARD_THREADS, 8)
+C10_LAUNCH_BOUNDS_2(BACKWARD_THREADS, 8)
#endif
__global__ void MaxPoolBackward(const int nthreads, const Dtype* top_diff,
const int64_t* top_mask, const int num, const int channels,
// Borrowed from Theano
// Authors: Arjun Jain, Frédéric Bastien, Jan Schlüter, Nicolas Ballas
template <typename Dtype>
-__global__ void C10_LAUNCH_BOUNDS(CUDA_NUM_THREADS) // ensure that at least 1 block can be resident
+__global__ void C10_LAUNCH_BOUNDS_1(CUDA_NUM_THREADS) // ensure that at least 1 block can be resident
im3d2col_kernel(const int64_t n, const Dtype* data_im,
const int64_t height, const int64_t width, const int64_t depth,
const int64_t kernel_h, const int64_t kernel_w, const int64_t kernel_d,
}
template <typename Dtype, typename Acctype>
-__global__ void C10_LAUNCH_BOUNDS(CUDA_NUM_THREADS) // ensure that at least 1 block can be resident
+__global__ void C10_LAUNCH_BOUNDS_1(CUDA_NUM_THREADS) // ensure that at least 1 block can be resident
col2im3d_kernel(const int64_t n, const Dtype* data_col,
const int64_t height, const int64_t width, const int64_t depth,
const int64_t channels,
#include <c10/macros/Macros.h>
template<typename Dtype, typename Acctype>
-C10_LAUNCH_BOUNDS(1024)
+C10_LAUNCH_BOUNDS_1(1024)
__global__ void caffe_gpu_interp2_kernel(const int n,
const Acctype rdepth, const Acctype rheight, const Acctype rwidth, const bool align_corners,
const THCDeviceTensor<Dtype, 5> data1, THCDeviceTensor<Dtype, 5> data2) {
// Backward (adjoint) operation 1 <- 2 (accumulates)
template <typename Dtype, typename Acctype>
-C10_LAUNCH_BOUNDS(1024)
+C10_LAUNCH_BOUNDS_1(1024)
__global__ void caffe_gpu_interp2_kernel_backward(const int n,
const Acctype rdepth, const Acctype rheight, const Acctype rwidth, const bool align_corners,
THCDeviceTensor<Dtype, 5> data1, const THCDeviceTensor<Dtype, 5> data2){
// Kernel for fast unfold+copy
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu)
template <typename Dtype>
-C10_LAUNCH_BOUNDS(CUDA_NUM_THREADS)
+C10_LAUNCH_BOUNDS_1(CUDA_NUM_THREADS)
__global__ void im2col_kernel(const int64_t n, const Dtype* data_im,
const int64_t height, const int64_t width,
const int64_t ksize_h, const int64_t ksize_w,
}
template <typename Dtype, typename Acctype>
-C10_LAUNCH_BOUNDS(CUDA_NUM_THREADS)
+C10_LAUNCH_BOUNDS_1(CUDA_NUM_THREADS)
__global__ void col2im_kernel(const int64_t n, const Dtype* data_col,
const int64_t height, const int64_t width, const int64_t channels,
const int64_t kernel_h, const int64_t kernel_w,
#define C10_MAX_THREADS_PER_BLOCK(val) (((val) <= CUDA_MAX_THREADS_PER_BLOCK) ? (val) : CUDA_THREADS_PER_BLOCK_FALLBACK)
#define C10_MIN_BLOCKS_PER_SM(threads_per_block, blocks_per_sm) ((((threads_per_block)*(blocks_per_sm) <= CUDA_MAX_THREADS_PER_SM) ? (blocks_per_sm) : ((CUDA_MAX_THREADS_PER_SM + (threads_per_block) - 1) / (threads_per_block))))
// C10_LAUNCH_BOUNDS is analogous to __launch_bounds__
-// https://stackoverflow.com/a/8814003 snippet to have macro with an optional argument
#define C10_LAUNCH_BOUNDS_0 __launch_bounds__(256, 4) // default launch bounds that should give good occupancy and versatility across all architectures.
#define C10_LAUNCH_BOUNDS_1(max_threads_per_block) __launch_bounds__((C10_MAX_THREADS_PER_BLOCK((max_threads_per_block))))
#define C10_LAUNCH_BOUNDS_2(max_threads_per_block, min_blocks_per_sm) __launch_bounds__((C10_MAX_THREADS_PER_BLOCK((max_threads_per_block))), (C10_MIN_BLOCKS_PER_SM((max_threads_per_block), (min_blocks_per_sm))))
-#define C10_LAUNCH_BOUNDS_X(x,max_threads_per_block,min_blocks_per_sm,FUNC, ...) FUNC
-#define C10_LAUNCH_BOUNDS(...) C10_LAUNCH_BOUNDS_X(,##__VA_ARGS__, C10_LAUNCH_BOUNDS_2(__VA_ARGS__), C10_LAUNCH_BOUNDS_1(__VA_ARGS__), C10_LAUNCH_BOUNDS_0(__VA_ARGS__))
#else
#define C10_HOST_DEVICE
#define C10_HOST
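For context on the rename above, here is a minimal usage sketch of the two explicit-arity macros. The kernel names are hypothetical, and the comments assume the common limits of 1024 threads per block and 2048 threads per SM; the real values come from CUDA_MAX_THREADS_PER_BLOCK and CUDA_MAX_THREADS_PER_SM in c10/macros/Macros.h.

#include <c10/macros/Macros.h>

// Hypothetical kernels, shown only to illustrate the two explicit-arity macros.

// One argument: bound the threads per block. C10_MAX_THREADS_PER_BLOCK clamps
// the request to the device-wide limit, so this is at most __launch_bounds__(1024).
template <typename scalar_t>
C10_LAUNCH_BOUNDS_1(1024)
__global__ void example_add_one_kernel(scalar_t* out, const scalar_t* in, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = in[i] + static_cast<scalar_t>(1);
  }
}

// Two arguments: also pass a minimum-resident-blocks hint. C10_MIN_BLOCKS_PER_SM
// keeps threads_per_block * min_blocks_per_sm within CUDA_MAX_THREADS_PER_SM;
// e.g. with a 2048-thread SM, (512, 4) is kept as-is, while (1024, 4) would be
// reduced to (1024, 2).
template <typename scalar_t>
C10_LAUNCH_BOUNDS_2(512, 4)
__global__ void example_scale_kernel(scalar_t* out, const scalar_t* in, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = in[i] * static_cast<scalar_t>(2);
  }
}

The diff drops the variadic C10_LAUNCH_BOUNDS wrapper, which relied on the `, ## __VA_ARGS__` comma-deletion behavior (a compiler extension rather than standard C++), in favor of the explicit _1/_2 spellings used throughout the hunks above.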
}
bool RunOnDevice() override {
- return DispatchHelper<TensorTypes<float, at::Half>>::call(this, Input(0));
+ return DispatchHelper<TensorTypes<float>>::call(this, Input(0));
}
template <typename T>
}
bool RunOnDevice() override {
- return DispatchHelper<TensorTypes<float, at::Half>>::call(this, Input(0));
+ return DispatchHelper<TensorTypes<float>>::call(this, Input(0));
}
template <typename T>
const T* X,
T* Y,
CUDAContext* context) const {
- if (std::is_same<T, at::Half>::value) {
- CAFFE_THROW("Float16 is not supported for average_pooling.");
- return false;
- } else {
return avg_pool_functor.GlobalPoolingForward<T, kOrder>(
N, C, HxW, X, Y, context);
- }
}
template <typename T, StorageOrder kOrder>
const T* X,
T* Y,
CUDAContext* context) const {
- if (std::is_same<T, at::Half>::value) {
- CAFFE_THROW("Float16 is not supported for average_pooling.");
- return false;
- } else {
return avg_pool_functor.Forward<T, kOrder>(
N, C, X_dims, Y_dims, kernel, dilation, stride, pads, X, Y, context);
- }
}
template <typename T, StorageOrder kOrder>
const T* Y,
T* dX,
CUDAContext* context) const {
- if (std::is_same<T, at::Half>::value) {
- CAFFE_THROW("Float16 is not supported for average_pooling.");
- return false;
- } else {
return avg_pool_functor.GlobalPoolingBackward<T, kOrder>(
N, C, HxW, dY, X, Y, dX, context);
- }
}
template <typename T, StorageOrder kOrder>
const T* Y,
T* dX,
CUDAContext* context) const {
- if (std::is_same<T, at::Half>::value) {
- CAFFE_THROW("Float16 is not supported for average_pooling.");
- return false;
- } else {
return avg_pool_functor.Backward<T, kOrder>(
N,
C,
Y,
dX,
context);
- }
}
const AveragePoolFunctor<CUDAContext> avg_pool_functor;
const T* X,
T* Y,
CUDAContext* context) const {
- if (std::is_same<T, at::Half>::value) {
- CAFFE_THROW("Float16 is not supported for max_pooling.");
- return false;
- } else {
return max_pool_functor.GlobalPoolingForward<T, kOrder>(
N, C, HxW, X, Y, context);
- }
}
template <typename T, StorageOrder kOrder>
const T* X,
T* Y,
CUDAContext* context) const {
- if (std::is_same<T, at::Half>::value) {
- CAFFE_THROW("Float16 is not supported for max_pooling.");
- return false;
- } else {
return max_pool_functor.Forward<T, kOrder>(
N, C, X_dims, Y_dims, kernel, dilation, stride, pads, X, Y, context);
- }
}
template <typename T, StorageOrder kOrder>
const T* Y,
T* dX,
CUDAContext* context) const {
- if (std::is_same<T, at::Half>::value) {
- CAFFE_THROW("Float16 is not supported for max_pooling.");
- return false;
- } else {
return max_pool_functor.GlobalPoolingBackward<T, kOrder>(
N, C, HxW, dY, X, Y, dX, context);
- }
}
template <typename T, StorageOrder kOrder>
const T* Y,
T* dX,
CUDAContext* context) const {
- if (std::is_same<T, at::Half>::value) {
- CAFFE_THROW("Float16 is not supported for max_pooling.");
- return false;
- } else {
return max_pool_functor.Backward<T, kOrder>(
N,
C,
Y,
dX,
context);
- }
}
const MaxPoolFunctor<CUDAContext> max_pool_functor;