#define AT_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
[&] { \
- const at::Type& the_type = TYPE; \
- switch (the_type.scalarType()) { \
+ switch (TYPE) { \
AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \
default: \
- AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \
+ AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
} \
}()
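A minimal usage sketch of the updated macro (hypothetical operator `scale_cpu`; assumes post-change ATen headers and a contiguous input): the first argument is now an `at::ScalarType`, so call sites pass `tensor.scalar_type()` where they previously passed `tensor.type()`.

```cpp
#include <ATen/ATen.h>
#include <ATen/Dispatch.h>

// Hypothetical operator illustrating the new calling convention. The macro
// binds scalar_t to double or float according to self.scalar_type(), and the
// default case raises the "not implemented for" error for any other dtype.
at::Tensor scale_cpu(const at::Tensor& self, double factor) {
  auto result = at::empty_like(self);
  AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "scale_cpu", [&] {
    const scalar_t* src = self.data<scalar_t>();
    scalar_t* dst = result.data<scalar_t>();
    for (int64_t i = 0; i < self.numel(); ++i) {
      dst[i] = src[i] * static_cast<scalar_t>(factor);
    }
  });
  return result;
}
```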
#define AT_DISPATCH_FLOATING_TYPES_AND_HALF(TYPE, NAME, ...) \
[&] { \
- const at::Type& the_type = TYPE; \
- switch (the_type.scalarType()) { \
+ switch (TYPE) { \
AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE(at::ScalarType::Half, at::Half, __VA_ARGS__) \
default: \
- AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \
+ AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
} \
}()
#define AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(TYPE, NAME, ...) \
[&] { \
- const at::Type& the_type = TYPE; \
- switch (the_type.scalarType()) { \
+ switch (TYPE) { \
AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE(at::ScalarType::Half, at::Half, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE( \
at::ScalarType::ComplexHalf, std::complex<at::Half>, __VA_ARGS__) \
default: \
- AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \
+ AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
} \
}()
#define AT_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \
[&] { \
- const at::Type& the_type = TYPE; \
- switch (the_type.scalarType()) { \
+ switch (TYPE) { \
AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \
default: \
- AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \
+ AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
} \
}()
#define AT_DISPATCH_ALL_TYPES(TYPE, NAME, ...) \
[&] { \
- const at::Type& the_type = TYPE; \
- switch (the_type.scalarType()) { \
+ switch (TYPE) { \
AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \
default: \
- AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \
+ AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
} \
}()
#define AT_DISPATCH_ALL_TYPES_AND(SCALARTYPE, TYPE, NAME, ...) \
[&] { \
- const at::Type& the_type = TYPE; \
- switch (the_type.scalarType()) { \
+ switch (TYPE) { \
AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \
    AT_PRIVATE_CASE_TYPE( \
        SCALARTYPE, decltype(c10::impl::ScalarTypeToCPPType<SCALARTYPE>::t), __VA_ARGS__) \
default: \
- AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \
+ AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
} \
}()
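A sketch of the `_AND` variant under the same assumptions: the extra `SCALARTYPE` argument splices one more case into the switch, here `at::ScalarType::Half`, so the kernel also instantiates for `at::Half` (hypothetical helper name).

```cpp
// Hypothetical helper: fills a contiguous tensor with ones across the standard
// type set plus Half, which the first macro argument adds as an extra case.
void fill_one_cpu(at::Tensor& self) {
  AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, self.scalar_type(), "fill_one_cpu", [&] {
    scalar_t* p = self.data<scalar_t>();
    for (int64_t i = 0; i < self.numel(); ++i) {
      p[i] = static_cast<scalar_t>(1);
    }
  });
}
```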
#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(SCALARTYPE, TYPE, NAME, ...) \
[&] { \
- const at::Type& the_type = TYPE; \
- switch (the_type.scalarType()) { \
+ switch (TYPE) { \
AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE( \
at::ScalarType::ComplexDouble, std::complex<double>, __VA_ARGS__) \
default: \
- AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \
+ AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
} \
}()
template <typename T>
inline T load(const void* data, ScalarType src_type) {
- return AT_DISPATCH_ALL_TYPES(CPU(src_type), "load", [&]() {
+ return AT_DISPATCH_ALL_TYPES(src_type, "load", [&]() {
return at::convert<T>(*(scalar_t*)data);
});
}
template <typename T>
inline void store(T value, void* dst, ScalarType dst_type) {
- AT_DISPATCH_ALL_TYPES(CPU(dst_type), "store", [&]() {
+ AT_DISPATCH_ALL_TYPES(dst_type, "store", [&]() {
*(scalar_t*)dst = at::convert<scalar_t>(value);
});
}
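Each `AT_DISPATCH_*` macro wraps an immediately invoked lambda, so the selected case's return value propagates out of the macro; `load` relies on this to return the converted scalar. A round-trip sketch, assuming a float source and an int destination:

```cpp
// Sketch: load dispatches on ScalarType::Float, so scalar_t is float and the
// macro returns at::convert<double>(*(float*)&f); store works symmetrically.
void demo_load_store() {
  float f = 1.5f;
  double d = load<double>(&f, at::ScalarType::Float);
  int32_t i = 0;
  store(d, &i, at::ScalarType::Int);  // writes *(int32_t*)&i = at::convert<int32_t>(d)
}
```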
// case1: shared weight for all channels
if (weight_num == 1) {
- AT_DISPATCH_FLOATING_TYPES(input.type(), "prelu_cpu", [&] {
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "prelu_cpu", [&] {
prelu_cpu_kernel_share_weights<scalar_t>(result, input, weight);
});
}
"Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num,
" and channel size = ", channel_size, ".");
- AT_DISPATCH_FLOATING_TYPES(input.type(), "prelu_cpu", [&] {
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "prelu_cpu", [&] {
prelu_cpu_kernel_multi_weights<scalar_t>(
result,
input,
// case1: shared parameter for all channels
if (weight_num == 1) {
- AT_DISPATCH_FLOATING_TYPES(input.type(), "prelu_backward_cpu", [&] {
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "prelu_backward_cpu", [&] {
prelu_cpu_backward_kernel_share_weights<scalar_t>(input, weight, grad_out, input_grad, weight_grad);
});
}
"Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num,
" and channel size = ", channel_size, ".");
- AT_DISPATCH_FLOATING_TYPES(input.type(), "prelu_backward_cpu", [&] {
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "prelu_backward_cpu", [&] {
prelu_cpu_backward_kernel_multi_weights<scalar_t>(
input,
weight,
// -----------------------------------
Tensor hardshrink_cpu(const Tensor & self, Scalar lambd) {
auto out_tensor = at::empty_like(self);
- AT_DISPATCH_FLOATING_TYPES(self.type(), "hardshrink_cpu", [&] {
+ AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "hardshrink_cpu", [&] {
auto lambd_val = lambd.to<scalar_t>();
at::CPU_tensor_apply2<scalar_t, scalar_t>(
self,
Tensor hardshrink_backward_cpu(const Tensor & grad, const Tensor & self, Scalar lambd) {
auto out_tensor = at::empty_like(self);
- AT_DISPATCH_FLOATING_TYPES(self.type(), "hardshrink_backward_cpu", [&] {
+ AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "hardshrink_backward_cpu", [&] {
auto lambd_val = lambd.to<scalar_t>();
at::CPU_tensor_apply3<scalar_t, scalar_t, scalar_t>(
self,
{
output.resize_({sizeD, osizeH, osizeW});
- AT_DISPATCH_FLOATING_TYPES(input.type(), "adaptive_avg_pool2d", [&] {
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "adaptive_avg_pool2d_cpu", [&] {
auto input_data = input.data<scalar_t>();
auto output_data = output.data<scalar_t>();
adaptive_avg_pool2d_out_frame<scalar_t>(input_data, output_data,
#pragma omp parallel for private(b)
for (b = 0; b < input.size(0); b++)
{
- AT_DISPATCH_FLOATING_TYPES(input.type(), "adaptive_avg_pool2d", [&] {
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "adaptive_avg_pool2d_cpu", [&] {
auto input_data = input.data<scalar_t>();
auto output_data = output.data<scalar_t>();
adaptive_avg_pool2d_out_frame<scalar_t>(input_data+b*input.stride(0), output_data+b*sizeD*osizeH*osizeW,
if (input.ndimension() == 3)
{
AT_DISPATCH_FLOATING_TYPES(
- input.type(), "adaptive_avg_pool2d_backward", [&] {
+ input.scalar_type(), "adaptive_avg_pool2d_backward_cpu", [&] {
/* get raw pointers */
scalar_t *gradInput_data = gradInput.data<scalar_t>();
scalar_t *gradOutput_data = gradOutput.data<scalar_t>();
for (b = 0; b < input.size(0); b++)
{
AT_DISPATCH_FLOATING_TYPES(
- input.type(), "adaptive_avg_pool2d_backward", [&] {
+ input.scalar_type(), "adaptive_avg_pool2d_backward_cpu", [&] {
/* get raw pointers */
scalar_t *gradInput_data = gradInput.data<scalar_t>();
scalar_t *gradOutput_data = gradOutput.data<scalar_t>();
return output;
}
- Tensor adaptive_avg_pool2d(
+ Tensor adaptive_avg_pool2d(
at::Tensor const& input,
IntArrayRef output_size){
if (output_size[0] == 1 && output_size[1] == 1) {
auto self_working_copy = cloneBatchedColumnMajor(self);
auto A_working_copy = cloneBatchedColumnMajor(A);
std::vector<int64_t> infos(batchCount(self), 0);
- AT_DISPATCH_FLOATING_TYPES(self.type(), "gesv", [&]{
+ AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "gesv_cpu", [&]{
apply_gesv<scalar_t>(self_working_copy, A_working_copy, infos);
});
if (self.dim() > 2) {
- batchCheckErrors(infos, "gesv");
+ batchCheckErrors(infos, "gesv_cpu");
} else {
- singleCheckErrors(infos[0], "gesv");
+ singleCheckErrors(infos[0], "gesv_cpu");
}
return std::tuple<Tensor, Tensor>(self_working_copy, A_working_copy);
}
}
std::tuple<Tensor&,Tensor&> gesv_out(Tensor& solution, Tensor& lu, const Tensor& self, const Tensor& A) {
- AT_CHECK(self.dim() == 2 && A.dim() == 2,
+ AT_CHECK(self.dim() == 2 && A.dim() == 2,
"torch.gesv() with the `out` keyword does not support batching. "
"b.dim() (", self.dim(), ") and A.dim() (", A.dim(), ") must both be 2.");
Tensor solution_tmp, lu_tmp;
Tensor _inverse_helper_cpu(const Tensor& self) {
std::vector<int64_t> infos(batchCount(self), 0);
auto self_working_copy = cloneBatchedColumnMajor(self);
- AT_DISPATCH_FLOATING_TYPES(self.type(), "inverse", [&]{
+ AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "inverse_cpu", [&]{
apply_inverse<scalar_t>(self_working_copy, infos);
});
- batchCheckErrors(infos, "inverse");
+ batchCheckErrors(infos, "inverse_cpu");
return self_working_copy;
}
auto self_working_copy = cloneBatchedColumnMajor(self);
auto A_working_copy = cloneBatchedColumnMajor(A);
std::vector<int64_t> infos(batchCount(self), 0);
- AT_DISPATCH_FLOATING_TYPES(self.type(), "cholesky_solve", [&]{
+ AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "cholesky_solve_cpu", [&]{
apply_cholesky_solve<scalar_t>(self_working_copy, A_working_copy, upper, infos);
});
if (self.dim() > 2) {
- batchCheckErrors(infos, "cholesky_solve");
+ batchCheckErrors(infos, "cholesky_solve_cpu");
} else {
- singleCheckErrors(infos[0], "cholesky_solve");
+ singleCheckErrors(infos[0], "cholesky_solve_cpu");
}
return self_working_copy;
}
Tensor _cholesky_helper_cpu(const Tensor& self, bool upper) {
std::vector<int64_t> infos(batchCount(self), 0);
auto self_working_copy = cloneBatchedColumnMajor(self);
- AT_DISPATCH_FLOATING_TYPES(self.type(), "cholesky", [&]{
+ AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "cholesky_cpu", [&]{
apply_cholesky<scalar_t>(self_working_copy, upper, infos);
});
if (self.dim() > 2) {
- batchCheckErrors(infos, "cholesky");
+ batchCheckErrors(infos, "cholesky_cpu");
} else {
- singleCheckErrors(infos[0], "cholesky");
+ singleCheckErrors(infos[0], "cholesky_cpu");
}
return self_working_copy;
}
bool inplace = checkTrilTriuBatchContiguous(self);
Tensor self_c = inplace ? self : self.contiguous();
Tensor result = inplace ? self : at::empty_like(self);
- AT_DISPATCH_ALL_TYPES(self.type(), "tril", [&]{
+ AT_DISPATCH_ALL_TYPES(self.scalar_type(), "tril", [&]{
apply_triu_tril<scalar_t, false>(result, self_c, inplace, k);
});
if (!inplace) self.copy_(result);
return result;
}
Tensor self_c = checkTrilTriuBatchContiguous(self) ? self : self.contiguous();
- AT_DISPATCH_ALL_TYPES(self.type(), "tril", [&]{
+ AT_DISPATCH_ALL_TYPES(self.scalar_type(), "tril", [&]{
apply_triu_tril<scalar_t, false>(result, self_c, false, k);
});
return result;
bool inplace = checkTrilTriuBatchContiguous(self);
Tensor self_c = inplace ? self : self.contiguous();
Tensor result = inplace ? self : at::empty_like(self);
- AT_DISPATCH_ALL_TYPES(self.type(), "triu", [&]{
+ AT_DISPATCH_ALL_TYPES(self.scalar_type(), "triu", [&]{
apply_triu_tril<scalar_t, true>(result, self_c, inplace, k);
});
if (!inplace) self.copy_(result);
return result;
}
Tensor self_c = checkTrilTriuBatchContiguous(self) ? self : self.contiguous();
- AT_DISPATCH_ALL_TYPES(self.type(), "triu", [&]{
+ AT_DISPATCH_ALL_TYPES(self.scalar_type(), "triu", [&]{
apply_triu_tril<scalar_t, true>(result, self_c, false, k);
});
return result;
template <typename self_T>
void _copy__cpu(at::Tensor& self, const at::Tensor& src) {
AT_CHECK(self.numel() == src.numel(), "sizes do not match");
- AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.type(), "_copy__cpu", [&]() {
+ AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_copy__cpu", [&]() {
_copy__cpu<self_T, scalar_t>(self, src);
});
}
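The copy path dispatches twice: the outer dispatch (shown just below) fixes the destination type as `self_T`, and the inner dispatch here fixes the source type, instantiating a kernel per (destination, source) pair. A self-contained sketch of the pattern with hypothetical names, capturing the outer `scalar_t` before the inner dispatch rebinds it:

```cpp
// Hypothetical two-level dispatch. The outer scalar_t is saved as dst_t; the
// nested macro then rebinds scalar_t to the source tensor's element type.
template <typename dst_t, typename src_t>
void convert_copy(at::Tensor& dst, const at::Tensor& src) {
  dst_t* dp = dst.data<dst_t>();
  const src_t* sp = src.data<src_t>();
  for (int64_t i = 0; i < dst.numel(); ++i) {
    dp[i] = at::convert<dst_t>(sp[i]);
  }
}

void copy_convert(at::Tensor& dst, const at::Tensor& src) {
  AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, dst.scalar_type(), "copy_convert", [&] {
    using dst_t = scalar_t;
    AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "copy_convert", [&] {
      convert_copy<dst_t, scalar_t>(dst, src);
    });
  });
}
```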
return self;
}
AT_DISPATCH_ALL_TYPES_AND(
- at::ScalarType::Half, self.type(), "_copy__cpu", [&]() { ::_copy__cpu<scalar_t>(self, src); });
+ at::ScalarType::Half, self.scalar_type(), "_copy__cpu", [&]() { ::_copy__cpu<scalar_t>(self, src); });
return self;
}
Tensor buf = empty({BLOCK_SZ, BLOCK_SZ}, self.options());
AT_DISPATCH_ALL_TYPES_AND(
- at::ScalarType::Half, self.type(), "_copy_same_type_transpose_", [&]() {
+ at::ScalarType::Half, self.scalar_type(), "_copy_same_type_transpose_", [&]() {
scalar_t* sp = src.data<scalar_t>();
scalar_t* rp = self.data<scalar_t>();
scalar_t* bp = buf.data<scalar_t>();
#ifdef _OPENMP
if (!in_parallel_region()) {
AT_DISPATCH_ALL_TYPES_AND(
- at::ScalarType::Half, self.type(), "_copy_same_type_", [&]() {
+ at::ScalarType::Half, self.scalar_type(), "_copy_same_type_", [&]() {
at::CPU_tensor_parallel_apply2<scalar_t, scalar_t>(
self, src, [](scalar_t& self_val, const scalar_t& src_val) {
self_val = src_val;
if (serial_path) {
AT_DISPATCH_ALL_TYPES_AND(
- at::ScalarType::Half, self.type(), "_copy_same_type_", [&]() {
+ at::ScalarType::Half, self.scalar_type(), "_copy_same_type_", [&]() {
at::CPU_tensor_apply2<scalar_t, scalar_t>(
self, src, [](scalar_t& self_val, const scalar_t& src_val) {
self_val = src_val;
}
Tensor& bernoulli_tensor_cpu_(Tensor& self, const Tensor& p_, Generator* gen) {
- AT_DISPATCH_ALL_TYPES(self.type(), "bernoulli_tensor_cpu_self_", [&] {
+ AT_DISPATCH_ALL_TYPES(self.scalar_type(), "bernoulli_tensor_cpu_self_", [&] {
THGenerator* generator = get_generator(gen);
std::lock_guard<std::mutex> lock(generator->mutex);
using self_t = scalar_t;
ret_val = static_cast<self_t>(THRandom_bernoulli(generator, p_val));
});
} else {
- AT_DISPATCH_FLOATING_TYPES(p_.type(), "bernoulli_tensor_cpu_p_", [&] {
+ AT_DISPATCH_FLOATING_TYPES(p_.scalar_type(), "bernoulli_tensor_cpu_p_", [&] {
auto p = std::get<0>(expand_inplace(self, p_.to(kCPU)));
using p_t = scalar_t;
CPU_tensor_apply2<self_t, p_t>(
return self;
}
#endif
- AT_DISPATCH_ALL_TYPES(self.type(), "bernoulli_scalar_cpu_", [&] {
+ AT_DISPATCH_ALL_TYPES(self.scalar_type(), "bernoulli_scalar_cpu_", [&] {
THGenerator* generator = get_generator(gen);
std::lock_guard<std::mutex> lock(generator->mutex);
CPU_tensor_apply1<scalar_t>(
Tensor _standard_gamma_grad_cpu(const Tensor& self, const Tensor& output) {
Tensor ret = at::empty(self.sizes(), self.options());
- AT_DISPATCH_FLOATING_TYPES(self.type(), "_standard_gamma_grad", [&] {
+ AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "_standard_gamma_grad_cpu", [&] {
CPU_tensor_apply3<scalar_t, scalar_t, scalar_t>(ret, self, output,
[](scalar_t& ret_val, const scalar_t& self_val, const scalar_t &output_val) {
ret_val = standard_gamma_grad_one<scalar_t, double>(self_val, output_val);
Tensor _s_poisson_cpu(const Tensor& lambda, Generator *gen) {
Tensor ret = at::zeros(lambda.sizes(), lambda.options());
- AT_DISPATCH_FLOATING_TYPES(ret.type(), "poisson", [&] {
+ AT_DISPATCH_FLOATING_TYPES(ret.scalar_type(), "poisson_cpu", [&] {
THGenerator* generator = get_generator(gen);
std::lock_guard<std::mutex> lock(generator->mutex);
CPU_tensor_apply2<scalar_t, scalar_t>(ret, lambda,
Tensor _s_gamma_cpu(const Tensor& alpha, Generator *gen) {
Tensor ret = at::zeros(alpha.sizes(), alpha.options());
- AT_DISPATCH_FLOATING_TYPES(ret.type(), "gamma", [&] {
+ AT_DISPATCH_FLOATING_TYPES(ret.scalar_type(), "gamma_cpu", [&] {
THGenerator* generator = get_generator(gen);
std::lock_guard<std::mutex> lock(generator->mutex);
CPU_tensor_apply2<scalar_t, scalar_t>(ret, alpha,
return std::tuple<Tensor, Tensor, Tensor, Tensor>(ret, offset2bag, bag_size, bag_size);
} else { // MODE_MAX
return AT_DISPATCH_FLOATING_TYPES_AND_HALF(
- weight.type(), "embedding_bag_cpu_max", [&]() {
+ weight.scalar_type(), "embedding_bag_cpu_max", [&]() {
return embedding_bag_cpu_max<scalar_t>(weight, indices, offset2bag, output, bag_size, offsets);
}
);
indices.resize_({numBatch, numPlanes, outputH, outputW});
}
- AT_DISPATCH_FLOATING_TYPES(input.type(),
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(),
"fractional_max_pool2d_out_frame", [&] {
auto input_data = input.data<scalar_t>();
auto output_data = output.data<scalar_t>();
/* backprop */
AT_DISPATCH_FLOATING_TYPES(
- input.type(), "fractional_max_pool2d_backward_out_frame", [&] {
+ input.scalar_type(), "fractional_max_pool2d_backward_out_frame", [&] {
auto gradInput_data = gradInput.data<scalar_t>();
auto gradOutput_data = gradOutput.data<scalar_t>();
auto indices_data = indices.data<int64_t>();
indices.resize_({numBatch, numPlanes, outputT, outputH, outputW});
}
AT_DISPATCH_FLOATING_TYPES(
- input.type(),
+ input.scalar_type(),
"fractional_max_pool3d_out_frame",
[&] {
fractional_max_pool3d_out_frame<scalar_t>(
/* backprop */
AT_DISPATCH_FLOATING_TYPES(
- input.type(),
+ input.scalar_type(),
"fractional_max_pool3d_backward_out_frame",
[&]{
fractional_max_pool3d_backward_out_frame<scalar_t>(
// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ].
Tensor grid_sampler_3d_cpu(const Tensor& input, const Tensor& grid,
int64_t interpolation_mode, int64_t padding_mode) {
- return AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler3d_cpu", [&] {
+ return AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "grid_sampler3d_cpu", [&] {
return grid_sampler_3d_cpu_impl<scalar_t>(
input, grid, static_cast<GridSamplerInterpolation>(interpolation_mode),
static_cast<GridSamplerPadding>(padding_mode));
std::tuple<Tensor, Tensor>
grid_sampler_3d_backward_cpu(const Tensor& grad_output, const Tensor& input, const Tensor& grid,
int64_t interpolation_mode, int64_t padding_mode) {
- return AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler_3d_backward_cpu", [&] {
+ return AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "grid_sampler_3d_backward_cpu", [&] {
return grid_sampler_3d_backward_cpu_impl<scalar_t>(
grad_output, input, grid,
static_cast<GridSamplerInterpolation>(interpolation_mode),
Tensor b_self, b_end, b_weight;
AT_CHECK(weight.dim() <= std::max(self.dim(), end.dim()),
"weight should be of dimension max(self.dim(), end.dim()) or lesser");
- std::tie(b_self, b_end, b_weight) = expand_outplace(self, end, weight, "lerp_out");
+ std::tie(b_self, b_end, b_weight) = expand_outplace(self, end, weight, "lerp_out_cpu");
result.resize_as_(b_self);
- AT_DISPATCH_FLOATING_TYPES(self.type(), "lerp_out", [&]{
+ AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "lerp_out_cpu", [&]{
lerp_cpu<scalar_t>(result, b_self, b_end, b_weight);
});
return result;
Tensor& lerp_cpu_scalar_out(Tensor& result, const Tensor& self,
const Tensor& end, Scalar weight) {
Tensor b_self, b_end;
- std::tie(b_self, b_end) = expand_outplace(self, end, "lerp_out");
+ std::tie(b_self, b_end) = expand_outplace(self, end, "lerp_out_cpu");
result.resize_as_(b_self);
- AT_DISPATCH_FLOATING_TYPES(self.type(), "lerp_out", [&]{
+ AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "lerp_out_cpu", [&]{
lerp_cpu<scalar_t>(result, b_self, b_end, weight.to<scalar_t>());
});
return result;
Tensor& lerp_cpu_tensor_(Tensor& self, const Tensor& end, const Tensor& weight) {
Tensor b_self, b_end, b_weight;
- std::tie(b_self, b_end, b_weight) = expand_outplace(self, end, weight, "lerp_");
+ std::tie(b_self, b_end, b_weight) = expand_outplace(self, end, weight, "lerp__cpu");
AT_CHECK(b_self.sizes() == self.sizes(),
"output with shape ", self.sizes(),
" doesn't match the broadcast shape ", b_self.sizes());
AT_CHECK(weight.dim() <= std::max(self.dim(), end.dim()),
"weight should be of dimension max(self.dim(), end.dim()) or lesser");
- AT_DISPATCH_FLOATING_TYPES(self.type(), "lerp_", [&]{
+ AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "lerp__cpu", [&]{
lerp_cpu<scalar_t>(self, b_self, b_end, b_weight);
});
return self;
Tensor& lerp_cpu_scalar_(Tensor& self, const Tensor& end, Scalar weight) {
Tensor b_self, b_end;
- std::tie(b_self, b_end) = expand_outplace(self, end, "lerp_");
+ std::tie(b_self, b_end) = expand_outplace(self, end, "lerp__cpu");
AT_CHECK(b_self.sizes() == self.sizes(),
"output with shape ", self.sizes(),
" doesn't match the broadcast shape ", b_self.sizes());
- AT_DISPATCH_FLOATING_TYPES(self.type(), "lerp_", [&]{
+ AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "lerp__cpu", [&]{
lerp_cpu<scalar_t>(self, b_self, b_end, weight.to<scalar_t>());
});
return self;
Tensor b_self, b_end, b_weight;
AT_CHECK(weight.dim() <= std::max(self.dim(), end.dim()),
"weight should be of dimension max(self.dim(), end.dim()) or lesser");
- std::tie(b_self, b_end, b_weight) = expand_outplace(self, end, weight, "lerp");
+ std::tie(b_self, b_end, b_weight) = expand_outplace(self, end, weight, "lerp_cpu");
Tensor result = at::empty_like(b_self);
- AT_DISPATCH_FLOATING_TYPES(result.type(), "lerp", [&]{
+ AT_DISPATCH_FLOATING_TYPES(result.scalar_type(), "lerp_cpu", [&]{
lerp_cpu<scalar_t>(result, b_self, b_end, b_weight);
});
return result;
Tensor lerp_cpu_scalar(const Tensor& self, const Tensor& end, Scalar weight) {
Tensor b_self, b_end;
- std::tie(b_self, b_end) = expand_outplace(self, end, "lerp");
+ std::tie(b_self, b_end) = expand_outplace(self, end, "lerp_cpu");
Tensor result = at::empty_like(b_self);
- AT_DISPATCH_FLOATING_TYPES(result.type(), "lerp", [&]{
+ AT_DISPATCH_FLOATING_TYPES(result.scalar_type(), "lerp_cpu", [&]{
lerp_cpu<scalar_t>(result, b_self, b_end, weight.to<scalar_t>());
});
return result;
if (contraction_size * res_rows * res_cols < 400) {
if (is_bmm_out) {
- AT_DISPATCH_ALL_TYPES(batch1.type(), "bmm", [&] {
+ AT_DISPATCH_ALL_TYPES(batch1.scalar_type(), "bmm", [&] {
baddbmm_cpu_kernel<scalar_t, true>(self_or_result, batch1, batch2, beta, alpha);
});
} else {
- AT_DISPATCH_ALL_TYPES(batch1.type(), "baddbmm", [&] {
+ AT_DISPATCH_ALL_TYPES(batch1.scalar_type(), "baddbmm", [&] {
baddbmm_cpu_kernel<scalar_t, false>(self_or_result, batch1, batch2, beta, alpha);
});
}
Tensor kl_div_backward_cpu(const Tensor& grad, const Tensor& input, const Tensor& target, int64_t reduction) {
auto grad_input = at::zeros_like(input);
auto grad_expand = grad.expand_as(input);
- AT_DISPATCH_FLOATING_TYPES(input.type(), "kl_div_backward", [&]() {
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "kl_div_backward_cpu", [&]() {
at::CPU_tensor_apply3<scalar_t, scalar_t, scalar_t>(
grad_input,
target,
std::tuple<Tensor, Tensor> ctc_loss_cpu(const Tensor& log_probs, const Tensor& targets, IntArrayRef input_lengths, IntArrayRef target_lengths, int64_t BLANK, bool zero_infinity) {
(void)zero_infinity; // only used for backwards
- return AT_DISPATCH_FLOATING_TYPES(log_probs.type(), "ctc_loss", [&] {
+ return AT_DISPATCH_FLOATING_TYPES(log_probs.scalar_type(), "ctc_loss_cpu", [&] {
if (targets.scalar_type() == kLong) {
return ctc_loss_cpu_template<scalar_t, kLong>(log_probs, targets, input_lengths, target_lengths, BLANK);
} else {
Tensor ctc_loss_backward_cpu(const Tensor& grad, const Tensor& log_probs, const Tensor& targets, IntArrayRef input_lengths, IntArrayRef target_lengths,
const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK, bool zero_infinity) {
- return AT_DISPATCH_FLOATING_TYPES(log_probs.type(), "ctc_loss_backward", [&] {
+ return AT_DISPATCH_FLOATING_TYPES(log_probs.scalar_type(), "ctc_loss_backward_cpu", [&] {
if (targets.scalar_type() == kLong) {
return ctc_loss_backward_cpu_template<scalar_t,kLong>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK, zero_infinity);
} else {
std::tuple<Tensor, Tensor> batch_norm_update_stats_cpu(
const Tensor& self, const Tensor& running_mean, const Tensor& running_var, double momentum) {
- return AT_DISPATCH_FLOATING_TYPES(self.type(), "batch_norm_update_stats", [&] {
+ return AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "batch_norm_update_stats_cpu", [&] {
return batch_norm_cpu_update_stats_template<scalar_t, Var>(self, running_mean, running_var, momentum, 0);
});
}
bool train, double momentum, double eps) {
checkBackend("batch_norm_cpu", {self, weight, bias, running_mean, running_var}, Backend::CPU);
- return AT_DISPATCH_FLOATING_TYPES(self.type(), "batch_norm", [&] {
+ return AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "batch_norm", [&] {
if (!train) {
return batch_norm_cpu_transform_input_template<scalar_t>(self, weight, bias, {}, {}, running_mean, running_var, train, eps);
} else {
std::tuple<Tensor, Tensor, Tensor> batch_norm_backward_cpu(const Tensor& grad_out, const Tensor& self, const Tensor& weight,
const Tensor& running_mean, const Tensor& running_var, const Tensor& save_mean, const Tensor& save_invstd,
bool train, double eps, std::array<bool,3> grad_input_mask) {
- return AT_DISPATCH_FLOATING_TYPES(self.type(), "batch_norm_backward", [&] {
+ return AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "batch_norm_backward_cpu", [&] {
return batch_norm_backward_cpu_template<scalar_t>(grad_out, self, weight, running_mean, running_var, save_mean, save_invstd, train, eps, grad_input_mask);
});
}
} else if (steps == 1) {
r.fill_(start);
} else {
- AT_DISPATCH_FLOATING_TYPES(r.type(), "linspace", [&]() {
+ AT_DISPATCH_FLOATING_TYPES(r.scalar_type(), "linspace_cpu", [&]() {
scalar_t scalar_start = start.to<scalar_t>();
scalar_t scalar_end = end.to<scalar_t>();
scalar_t *data_ptr = r.data<scalar_t>();
} else if (steps == 1) {
r.fill_(std::pow(10.0, start.to<double>()));
} else {
- AT_DISPATCH_FLOATING_TYPES(r.type(), "logspace", [&]() {
+ AT_DISPATCH_FLOATING_TYPES(r.scalar_type(), "logspace_cpu", [&]() {
scalar_t base10 = 10;
scalar_t scalar_start = start.to<scalar_t>();
scalar_t scalar_end = end.to<scalar_t>();
}
Tensor& range_cpu_out(Tensor& result, Scalar start, Scalar end, Scalar step) {
- AT_DISPATCH_ALL_TYPES(result.type(), "range", [&]() {
+ AT_DISPATCH_ALL_TYPES(result.scalar_type(), "range_cpu", [&]() {
using accscalar_t = at::acc_type<scalar_t, false>;
auto xstart = start.to<accscalar_t>();
auto xend = end.to<accscalar_t>();
}
Tensor& arange_cpu_out(Tensor& result, Scalar start, Scalar end, Scalar step) {
- AT_DISPATCH_ALL_TYPES(result.type(), "arange", [&]() {
+ AT_DISPATCH_ALL_TYPES(result.scalar_type(), "arange_cpu", [&]() {
using accscalar_t = at::acc_type<scalar_t, false>;
auto xstart = start.to<accscalar_t>();
auto xend = end.to<accscalar_t>();
/* resize output */
if (input.ndimension() == 2) {
output.resize_({nplane, output_w});
- AT_DISPATCH_FLOATING_TYPES(input.type(), "reflection_pad1d", [&] {
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "reflection_pad1d", [&] {
reflection_pad1d_out_frame<scalar_t>(
input.data<scalar_t>(), output.data<scalar_t>(),
nplane,
});
} else {
output.resize_({nbatch, nplane, output_w});
- AT_DISPATCH_FLOATING_TYPES(input.type(), "reflection_pad1d", [&] {
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "reflection_pad1d", [&] {
reflection_pad1d_out_loop<scalar_t>(
input.data<scalar_t>(), output.data<scalar_t>(),
nbatch, nplane,
/* backprop */
if (input.ndimension() == 2) {
AT_DISPATCH_FLOATING_TYPES(
- grad_input.type(), "reflection_pad1d_backward", [&] {
+ grad_input.scalar_type(), "reflection_pad1d_backward", [&] {
reflection_pad1d_backward_out_frame(
grad_input.data<scalar_t>(), grad_output.data<scalar_t>(),
nplane,
);
} else {
AT_DISPATCH_FLOATING_TYPES(
- grad_input.type(), "reflection_pad1d_backward", [&] {
+ grad_input.scalar_type(), "reflection_pad1d_backward", [&] {
reflection_pad1d_backward_out_loop(
grad_input.data<scalar_t>(),
grad_output.data<scalar_t>(),
if (input.ndimension() == 3) {
/* resize output */
output.resize_({nplane, output_h, output_w});
- AT_DISPATCH_FLOATING_TYPES(input.type(), "reflection_pad2d", [&] {
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "reflection_pad2d", [&] {
reflection_pad2d_out_frame(
input.data<scalar_t>(), output.data<scalar_t>(),
nplane,
} else {
/* resize output */
output.resize_({nbatch, nplane, output_h, output_w});
- AT_DISPATCH_FLOATING_TYPES(input.type(), "reflection_pad2d", [&] {
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "reflection_pad2d", [&] {
reflection_pad2d_out_loop(
input.data<scalar_t>(), output.data<scalar_t>(),
nbatch, nplane,
/* backprop */
if (input.ndimension() == 3) {
AT_DISPATCH_FLOATING_TYPES(
- grad_output.type(), "reflection_pad2d_backward", [&] {
+ grad_output.scalar_type(), "reflection_pad2d_backward", [&] {
reflection_pad2d_backward_out_frame(
grad_input.data<scalar_t>(), grad_output.data<scalar_t>(),
nplane,
);
} else {
AT_DISPATCH_FLOATING_TYPES(
- grad_output.type(), "reflection_pad2d_backward", [&] {
+ grad_output.scalar_type(), "reflection_pad2d_backward", [&] {
reflection_pad2d_backward_out_loop(
grad_input.data<scalar_t>(), grad_output.data<scalar_t>(),
nbatch, nplane,
if (input.ndimension() == 2)
{
output.resize_({nslices, owidth});
- AT_DISPATCH_FLOATING_TYPES(input.type(), "replication_pad1d", [&] {
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "replication_pad1d_cpu", [&] {
auto input_data = input.data<scalar_t>();
auto output_data = output.data<scalar_t>();
replication_pad1d_out_frame<scalar_t>(
else
{
output.resize_({nbatch, nslices, owidth});
- AT_DISPATCH_FLOATING_TYPES(input.type(), "replication_pad1d", [&] {
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "replication_pad1d_cpu", [&] {
auto input_data = input.data<scalar_t>();
auto output_data = output.data<scalar_t>();
replication_pad1d_out_batch<scalar_t>(
if (input.ndimension() == 2)
{
AT_DISPATCH_FLOATING_TYPES(
- input.type(), "replication_pad1d_backward", [&] {
+ input.scalar_type(), "replication_pad1d_backward_cpu", [&] {
scalar_t *gradInput_data = gradInput.data<scalar_t>();
scalar_t *gradOutput_data = gradOutput.data<scalar_t>();
else
{
AT_DISPATCH_FLOATING_TYPES(
- input.type(), "replication_pad1d_backward", [&] {
+ input.scalar_type(), "replication_pad1d_backward_cpu", [&] {
scalar_t *gradInput_data = gradInput.data<scalar_t>();
scalar_t *gradOutput_data = gradOutput.data<scalar_t>();
if (input.dim() == 3)
{
output.resize_({nslices, oheight, owidth});
- AT_DISPATCH_FLOATING_TYPES(input.type(), "replication_pad2d", [&] {
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "replication_pad2d_cpu", [&] {
auto input_data = input.data<scalar_t>();
auto output_data = output.data<scalar_t>();
replication_pad2d_out_frame<scalar_t> (input_data, output_data,
else
{
output.resize_({nbatch, nslices, oheight, owidth});
- AT_DISPATCH_FLOATING_TYPES(input.type(), "replication_pad2d", [&] {
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "replication_pad2d_cpu", [&] {
auto input_data = input.data<scalar_t>();
auto output_data = output.data<scalar_t>();
replication_pad2d_out_batch<scalar_t> (input_data, output_data,
if (input.dim() == 3)
{
AT_DISPATCH_FLOATING_TYPES(
- input.type(), "replication_pad2d_backward", [&] {
+ input.scalar_type(), "replication_pad2d_backward_cpu", [&] {
replication_pad2d_backward_out_frame<scalar_t>(
gradInput.data<scalar_t>(),
gradOutput.data<scalar_t>(),
else
{
AT_DISPATCH_FLOATING_TYPES(
- input.type(), "replication_pad2d_backward", [&] {
+ input.scalar_type(), "replication_pad2d_backward_cpu", [&] {
replication_pad2d_backward_out_batch<scalar_t>(
gradInput.data<scalar_t>(),
gradOutput.data<scalar_t>(),
if (input.dim() == 4)
{
output.resize_({nslices, odepth, oheight, owidth});
- AT_DISPATCH_FLOATING_TYPES(input.type(), "replication_pad3d", [&] {
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "replication_pad3d_cpu", [&] {
auto input_data = input.data<scalar_t>();
auto output_data = output.data<scalar_t>();
replication_pad3d_out_frame<scalar_t>(
else
{
output.resize_({nbatch, nslices, odepth, oheight, owidth});
- AT_DISPATCH_FLOATING_TYPES(input.type(), "replication_pad3d", [&] {
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "replication_pad3d_cpu", [&] {
auto input_data = input.data<scalar_t>();
auto output_data = output.data<scalar_t>();
replication_pad3d_out_batch<scalar_t>(
if (input.dim() == 4)
{
AT_DISPATCH_FLOATING_TYPES(
- input.type(), "replication_pad3d_backward", [&] {
+ input.scalar_type(), "replication_pad3d_backward_cpu", [&] {
replication_pad3d_backward_out_frame<scalar_t> (
gradInput.data<scalar_t>(),
gradOutput.data<scalar_t>(),
else
{
AT_DISPATCH_FLOATING_TYPES(
- input.type(), "replication_pad3d_backward", [&] {
+ input.scalar_type(), "replication_pad3d_backward_cpu", [&] {
replication_pad3d_backward_out_batch<scalar_t> (
gradInput.data<scalar_t>(),
gradOutput.data<scalar_t>(),
Scalar _local_scalar_dense_cpu(const Tensor& self) {
Scalar r;
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(
- at::ScalarType::Half, self.type(), "_local_scalar_dense_cpu", [&] {
+ at::ScalarType::Half, self.scalar_type(), "_local_scalar_dense_cpu", [&] {
scalar_t value = *self.data<scalar_t>();
r = Scalar(value);
});
if (input.ndimension() > 0 && dim == input.ndimension() - 1) {
softmax_lastdim_kernel(kCPU, output, input);
} else {
- AT_DISPATCH_FLOATING_TYPES(input.type(), "softmax", [&] {
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "softmax", [&] {
host_softmax<scalar_t, false>(output, input, dim);
});
}
if (input.ndimension() > 0 && dim == input.ndimension() - 1) {
log_softmax_lastdim_kernel(kCPU, output, input);
} else {
- AT_DISPATCH_FLOATING_TYPES(input.type(), "log_softmax", [&] {
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "log_softmax", [&] {
host_softmax<scalar_t, true>(output, input, dim);
});
}
if (grad.ndimension() > 0 && dim == grad.ndimension() - 1) {
softmax_backward_lastdim_kernel(kCPU, grad_input, grad, output);
} else {
- AT_DISPATCH_FLOATING_TYPES(grad.type(), "softmax_backward", [&] {
+ AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "softmax_backward", [&] {
host_softmax_backward<scalar_t, false>(grad_input, grad, output, dim);
});
}
if (grad.ndimension() > 0 && dim == grad.ndimension() - 1) {
log_softmax_backward_lastdim_kernel(kCPU, grad_input, grad, output);
} else {
- AT_DISPATCH_FLOATING_TYPES(grad.type(), "log_softmax_backward", [&] {
+ AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "log_softmax_backward", [&] {
host_softmax_backward<scalar_t, true>(grad_input, grad, output, dim);
});
}
}
auto tmp_values = self.clone();
auto tmp_indices = at::empty(self.sizes(), self.options().dtype(kLong));
- AT_DISPATCH_ALL_TYPES(self.type(), "kthvalue", [&] {
+ AT_DISPATCH_ALL_TYPES(self.scalar_type(), "kthvalue_cpu", [&] {
dim_apply(
{tmp_values, tmp_indices, values, indices},
dim,
Tensor
_bincount_cpu(const Tensor& self, const Tensor& weights, int64_t minlength) {
- return AT_DISPATCH_INTEGRAL_TYPES(self.type(), "bincount", [&] {
+ return AT_DISPATCH_INTEGRAL_TYPES(self.scalar_type(), "bincount_cpu", [&] {
const auto scalar = weights.scalar_type();
if (scalar == ScalarType::Undefined || scalar == ScalarType::Float)
return _bincount_cpu_template<scalar_t, float>(self.contiguous(), weights.contiguous(), minlength);
Tensor _s_where_cpu(const Tensor& condition, const Tensor& self, const Tensor& other) {
Tensor ret = at::empty(self.sizes(), self.options());
- AT_DISPATCH_ALL_TYPES(ret.type(), "where", [&] {
+ AT_DISPATCH_ALL_TYPES(ret.scalar_type(), "where_cpu", [&] {
where_cpu<scalar_t>(ret, condition, self, other);
});
return ret;
result.zero_();
int64_t sz = std::min<int64_t>(n, m);
- AT_DISPATCH_ALL_TYPES(result.type(), "eye", [&]() -> void {
+ AT_DISPATCH_ALL_TYPES(result.scalar_type(), "eye", [&]() -> void {
scalar_t* result_data = result.data<scalar_t>();
for(int64_t i = 0; i < sz; i++) {
result_data[i*(result.strides()[0] + result.strides()[1])] = 1;
AT_CHECK(n >= 0, "n must be non-negative, got", n);
result.resize_({n});
auto gen = get_generator(generator);
- AT_DISPATCH_ALL_TYPES(result.type(), "randperm", [&]() -> void {
+ AT_DISPATCH_ALL_TYPES(result.scalar_type(), "randperm", [&]() -> void {
randperm_cpu<scalar_t>(result, n, gen);
});
//
// 3. sequential RAM + transpose: create an n X 2 Tensor, fill the Tensor
// sequentially, and then transpose it.
- AT_DISPATCH_ALL_TYPES(result.type(), "tril_indices", [&]() -> void {
+ AT_DISPATCH_ALL_TYPES(result.scalar_type(), "tril_indices", [&]() -> void {
// fill the Tensor with correct values
scalar_t* result_data = result.data<scalar_t>();
int64_t i = 0;
// create an empty Tensor with correct size
auto result = at::empty({2, triu_size}, options);
- AT_DISPATCH_ALL_TYPES(result.type(), "triu_indices", [&]() -> void {
+ AT_DISPATCH_ALL_TYPES(result.scalar_type(), "triu_indices", [&]() -> void {
// fill the Tensor with correct values
scalar_t* result_data = result.data<scalar_t>();
int64_t i = 0;
Tensor tensor_cpu(ArrayRef<T> values, const TensorOptions& options) {
auto result = at::empty(values.size(), options);
AT_ASSERT(result.is_contiguous());
- AT_DISPATCH_ALL_TYPES(result.type(), "tensor_cpu", [&] {
+ AT_DISPATCH_ALL_TYPES(result.scalar_type(), "tensor_cpu", [&] {
std::copy(values.begin(), values.end(), result.template data<scalar_t>());
});
return result;
AT_ASSERT(operands_[arg].type);
return *operands_[arg].type;
}
- ScalarType dtype(int arg) const { return type(arg).scalarType(); }
+ ScalarType dtype(int arg=0) const { return type(arg).scalarType(); }
DeviceType device_type(int arg=0) const { return type(arg).device_type(); }
int64_t element_size(int arg) const { return type(arg).elementSizeInBytes(); }
bool is_scalar(int arg) const;
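With the added default argument, `dtype()` alone is shorthand for `dtype(0)`, the `ScalarType` of operand 0; this is what lets the kernel call sites elsewhere in this change write `iter.dtype()` instead of `iter.type().scalarType()`. A call-site sketch with a hypothetical kernel name:

```cpp
// Hypothetical kernel: dispatches directly on the iterator's operand-0 dtype.
static void ones_like_kernel(at::TensorIterator& iter) {
  AT_DISPATCH_ALL_TYPES(iter.dtype(), "ones_like_cpu", [&] {
    // iter.dtype() == iter.dtype(0); no at::Type object is involved anymore.
  });
}
```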
}
}
- AT_DISPATCH_ALL_TYPES(in_tensor.type(), "flip_cpu", [&] {
+ AT_DISPATCH_ALL_TYPES(in_tensor.scalar_type(), "flip_cpu", [&] {
flip_cpu_kernel<scalar_t>(
total_dims,
stride_contiguous_v,
if (self.scalar_type() == ScalarType::Half) {
return true;
}
- return AT_DISPATCH_ALL_TYPES(self.type(), "is_signed", [&]() -> bool {
+ return AT_DISPATCH_ALL_TYPES(self.scalar_type(), "is_signed", [&]() -> bool {
return std::is_signed<scalar_t>();
});
}
std::tuple<Tensor, Tensor>
_unique_cpu(const Tensor& self, const bool sorted, const bool return_inverse) {
- return AT_DISPATCH_ALL_TYPES(self.type(), "unique", [&] {
+ return AT_DISPATCH_ALL_TYPES(self.scalar_type(), "unique_cpu", [&] {
return _unique_cpu_template<scalar_t>(self, sorted, return_inverse);
});
}
std::tuple<Tensor, Tensor>
_unique_dim_cpu(const Tensor& self, const int64_t dim, const bool sorted, const bool return_inverse) {
- return AT_DISPATCH_ALL_TYPES(self.type(), "unique_dim", [&] {
+ return AT_DISPATCH_ALL_TYPES(self.scalar_type(), "unique_dim", [&] {
// The current implementation using `dim` always sorts due to unhashable tensors
return _unique_dim_cpu_template<scalar_t>(self, dim, return_inverse);
});
namespace {
static void threshold_kernel(TensorIterator& iter, Scalar threshold_scalar, Scalar value_scalar) {
- AT_DISPATCH_ALL_TYPES(iter.type(), "threshold", [&] {
+ AT_DISPATCH_ALL_TYPES(iter.dtype(), "threshold_cpu", [&] {
using Vec = Vec256<scalar_t>;
scalar_t threshold = threshold_scalar.to<scalar_t>();
scalar_t value = value_scalar.to<scalar_t>();
using namespace vec256;
void add_kernel(TensorIterator& iter, Scalar alpha_scalar) {
- AT_DISPATCH_ALL_TYPES(iter.type(), "add", [&]() {
+ AT_DISPATCH_ALL_TYPES(iter.dtype(), "add_cpu", [&]() {
auto alpha = alpha_scalar.to<scalar_t>();
auto alpha_vec = Vec256<scalar_t>(alpha);
binary_kernel_vec(iter,
}
void mul_kernel(TensorIterator& iter) {
- AT_DISPATCH_ALL_TYPES(iter.type(), "mul", [&]() {
+ AT_DISPATCH_ALL_TYPES(iter.dtype(), "mul_cpu", [&]() {
binary_kernel_vec(iter,
[=](scalar_t a, scalar_t b) -> scalar_t { return a * b; },
[=](Vec256<scalar_t> a, Vec256<scalar_t> b) {
}
void div_kernel(TensorIterator& iter) {
- if (isIntegralType(iter.type().scalarType())) {
+ if (isIntegralType(iter.dtype())) {
// There's no SIMD integer division, so don't try to vectorize it.
// TODO: if the divisor is a scalar, rewrite as multiplication by a constant.
- AT_DISPATCH_INTEGRAL_TYPES(iter.type(), "div", [&]() {
+ AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "div_cpu", [&]() {
binary_kernel(iter, [](scalar_t a, scalar_t b) -> scalar_t {
return a / b;
});
});
} else {
- AT_DISPATCH_FLOATING_TYPES(iter.type(), "div", [&]() {
+ AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "div_cpu", [&]() {
binary_kernel_vec(iter,
[=](scalar_t a, scalar_t b) __ubsan_ignore_float_divide_by_zero__ -> scalar_t {
return a / b;
static void copy_kernel_impl(Tensor& dst, const Tensor& src) {
AT_DISPATCH_ALL_TYPES_AND(
- at::ScalarType::Half, dst.type(), "copy_kernel_impl", [&]() {
+ at::ScalarType::Half, dst.scalar_type(), "copy_kernel_impl", [&]() {
scalar_t* self_ptr = dst.data<scalar_t>();
scalar_t* src_ptr = src.data<scalar_t>();
};
void pdist_forward_kernel_impl(Tensor& result, const Tensor& self, const double p) {
- AT_DISPATCH_FLOATING_TYPES(self.type(), "pdist", [&] {
+ AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "pdist", [&] {
PDist<scalar_t>::apply(result, self, p);
});
}
static void pdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor& self, const double p, const Tensor& dist) {
- AT_DISPATCH_FLOATING_TYPES(self.type(), "pdist_backward", [&] {
+ AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "pdist_backward", [&] {
PDist<scalar_t>::apply_backward(result, grad, self, p, dist);
});
}
static void cdist_kernel_impl(Tensor& result, const Tensor& x1, const Tensor& x2, const double p) {
- AT_DISPATCH_FLOATING_TYPES(result.type(), "cdist", [&] {
+ AT_DISPATCH_FLOATING_TYPES(result.scalar_type(), "cdist", [&] {
PDist<scalar_t>::apply_cdist(result, x1, x2, p);
});
}
mask_scatter_add(const scalar_t *src, scalar_t* base_addr,
const int_same_size_t<scalar_t> *offsets,
const int_same_size_t<scalar_t> *mask, int64_t len) {
- #ifndef _MSC_VER
- # pragma unroll
+ #ifndef _MSC_VER
+ # pragma unroll
#endif
for (int64_t i = 0; i < len; i++) {
if (mask[i] & 0x01) {
auto i_sw_offset = i_nw_offset + iVec(inp_sH);
auto i_se_offset = i_sw_offset + iVec(inp_sW);
- #ifndef _MSC_VER
- # pragma unroll
+ #ifndef _MSC_VER
+ # pragma unroll
#endif
for (int64_t c = 0; c < C; ++c) {
auto inp_slice_C_ptr = inp_slice[c].data();
scalar_t gInp_corner_arr[Vec::size()];
auto gx = Vec(0), gy = Vec(0);
- #ifndef _MSC_VER
- # pragma unroll
+ #ifndef _MSC_VER
+ # pragma unroll
#endif
for (int64_t c = 0; c < C; ++c) {
auto inp_slice_C_ptr = inp_slice[c].data();
auto out_ptr = out_slice.data() + offset;
auto out_sC = out_slice.stride(0);
auto inp_slice_ptr = inp_slice.data();
- #ifndef _MSC_VER
- # pragma unroll
+ #ifndef _MSC_VER
+ # pragma unroll
#endif
for (int c = 0; c < C; ++c, out_ptr += out_sC, inp_slice_ptr += inp_sC) {
// mask_gather zeros out the mask, so we need to make a copy
integer_t gInp_offset_arr[iVec::size()];
i_gInp_offset.store(gInp_offset_arr);
- #ifndef _MSC_VER
- # pragma unroll
+ #ifndef _MSC_VER
+ # pragma unroll
#endif
for (int64_t c = 0; c < C; ++c) {
mask_scatter_add(gOut_slice[c].data() + offset, gInp_slice[c].data(),
auto spatial_offset = 0;
auto i_offsets_delta = iVec(grid_sW * step);
- #ifndef _MSC_VER
- # pragma unroll
+ #ifndef _MSC_VER
+ # pragma unroll
#endif
for (int64_t h = 0; h < out_H; h++) {
auto grid_ptr_x = grid_ptr + h * grid_sH;
auto grid_ptr_y = grid_ptr_x + grid_sCoor;
auto i_offsets = iVec::arange(0, grid_sW);
- #ifndef _MSC_VER
- # pragma unroll
+ #ifndef _MSC_VER
+ # pragma unroll
#endif
for (int64_t w = 0; w < out_W; w += step) {
auto len = std::min(step, out_W - w);
return; \
}
- AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler_2d_cpu_kernel_impl", [&] {
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "grid_sampler_2d_cpu_kernel_impl", [&] {
auto out_acc = output.accessor<scalar_t, 4>();
auto inp_acc = input.accessor<scalar_t, 4>();
auto grid_acc = grid.accessor<scalar_t, 4>();
return; \
}
- AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler_2d_backward_cpu_kernel_impl", [&] {
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "grid_sampler_2d_backward_cpu_kernel_impl", [&] {
auto gInp_acc = grad_input.accessor<scalar_t, 4>();
auto gGrid_acc = grad_grid.accessor<scalar_t, 4>();
auto inp_acc = input.accessor<scalar_t, 4>();
}
void index_kernel(TensorIterator& iter, IntArrayRef index_size, IntArrayRef index_stride) {
- AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, iter.type(0), "index", [&] {
+ AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, iter.dtype(), "index_cpu", [&] {
cpu_index_kernel<scalar_t>(iter, index_size, index_stride, [](char* dst, char* src, int64_t offset) {
*(scalar_t*)dst = *(scalar_t*)(src + offset);
});
void index_put_kernel(TensorIterator& iter, IntArrayRef index_size, IntArrayRef index_stride, bool accumulate) {
// NOTE: duplicate indices are only supported if accumulate is true.
- AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, iter.type(0), "index_put", [&] {
+ AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, iter.dtype(), "index_put", [&] {
if (accumulate) {
// TODO: investigate parallelization of the accumulate kernel. Unlike the non-accumulate case,
// this needs to be thread-safe.
using namespace vec256;
static void sum_kernel_impl(TensorIterator& iter) {
- AT_DISPATCH_ALL_TYPES(iter.type(), "sum", [&] {
+ AT_DISPATCH_ALL_TYPES(iter.dtype(), "sum_cpu", [&] {
binary_kernel_reduce_vec(
iter,
[=](scalar_t a, scalar_t b) -> scalar_t { return a + b; },
}
static void mean_kernel_impl(TensorIterator& iter) {
- AT_DISPATCH_ALL_TYPES(iter.type(), "mean", [&] {
+ AT_DISPATCH_ALL_TYPES(iter.dtype(), "mean_cpu", [&] {
scalar_t factor = scalar_t(iter.num_output_elements()) / iter.numel();
binary_kernel_reduce(
iter,
}
static void std_var_kernel_impl(TensorIterator &iter, bool unbiased, bool take_sqrt) {
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.type(), "std", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "std_cpu", [&] {
binary_kernel_reduce(
iter,
WelfordOps<scalar_t, double, int64_t, double> { unbiased, take_sqrt },
}
static void prod_kernel_impl(TensorIterator& iter) {
- AT_DISPATCH_ALL_TYPES(iter.type(), "prod", [&] {
+ AT_DISPATCH_ALL_TYPES(iter.dtype(), "prod_cpu", [&] {
binary_kernel_reduce_vec(
iter,
[=](scalar_t a, scalar_t b) -> scalar_t { return a * b; },
if (val == 0) {
- AT_DISPATCH_FLOATING_TYPES(iter.type(), "norm", [&] {
+ AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "norm_cpu", [&] {
binary_kernel_reduce(
iter,
NormZeroOps<scalar_t>(),
);
});
} else if (val == 1) {
- AT_DISPATCH_FLOATING_TYPES(iter.type(), "norm", [&] {
+ AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "norm_cpu", [&] {
binary_kernel_reduce(
iter,
NormOneOps<scalar_t>(),
);
});
} else if (val == INFINITY) {
- AT_DISPATCH_FLOATING_TYPES(iter.type(), "norm", [&] {
+ AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "norm_cpu", [&] {
binary_kernel_reduce(
iter,
AbsMaxOps<scalar_t>(),
);
});
} else if (val == -INFINITY) {
- AT_DISPATCH_FLOATING_TYPES(iter.type(), "norm", [&] {
+ AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "norm_cpu", [&] {
binary_kernel_reduce(
iter,
AbsMinOps<scalar_t>(),
);
});
} else {
- AT_DISPATCH_FLOATING_TYPES(iter.type(), "norm", [&] {
+ AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "norm_cpu", [&] {
binary_kernel_reduce(
iter,
NormOps<scalar_t> { scalar_t(val) },
}
static void min_values_kernel_impl(TensorIterator& iter) {
- AT_DISPATCH_ALL_TYPES(iter.type(), "min_values", [&iter] {
+ AT_DISPATCH_ALL_TYPES(iter.dtype(), "min_values_cpu", [&iter] {
binary_kernel_reduce_vec(
iter,
[](scalar_t a, scalar_t b) -> scalar_t { return std::min(a, b); },
}
static void max_values_kernel_impl(TensorIterator& iter) {
- AT_DISPATCH_ALL_TYPES(iter.type(), "min_values", [&iter] {
+ AT_DISPATCH_ALL_TYPES(iter.dtype(), "max_values_cpu", [&iter] {
binary_kernel_reduce_vec(
iter,
[](scalar_t a, scalar_t b) -> scalar_t { return std::max(a, b); },
};
static void softmax_lastdim_kernel_impl(Tensor& result, const Tensor& self) {
- AT_DISPATCH_FLOATING_TYPES(self.type(), "softmax_lastdim_kernel_impl", [&] {
+ AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "softmax_lastdim_kernel_impl", [&] {
vec_host_softmax_lastdim<scalar_t, false>::apply(result, self);
});
}
Tensor& result,
const Tensor& self) {
AT_DISPATCH_FLOATING_TYPES(
- self.type(), "log_softmax_lastdim_kernel_impl", [&] {
+ self.scalar_type(), "log_softmax_lastdim_kernel_impl", [&] {
vec_host_softmax_lastdim<scalar_t, true>::apply(result, self);
});
}
const Tensor& grad,
const Tensor& output) {
AT_DISPATCH_FLOATING_TYPES(
- grad.type(), "softmax_backward_lastdim_kernel_impl", [&] {
+ grad.scalar_type(), "softmax_backward_lastdim_kernel_impl", [&] {
vec_host_softmax_backward_lastdim<scalar_t, false>::apply(
grad_input, grad, output);
});
const Tensor& grad,
const Tensor& output) {
AT_DISPATCH_FLOATING_TYPES(
- grad.type(), "log_softmax_backward_lastdim_kernel_impl", [&] {
+ grad.scalar_type(), "log_softmax_backward_lastdim_kernel_impl", [&] {
vec_host_softmax_backward_lastdim<scalar_t, true>::apply(
grad_input, grad, output);
});
Tensor& max_indices,
const Tensor& self,
c10::optional<int64_t> dim) {
- AT_DISPATCH_ALL_TYPES(self.type(), "max", [&] {
+ AT_DISPATCH_ALL_TYPES(self.scalar_type(), "max", [&] {
Reduction<scalar_t, int64_t>::apply(max, max_indices, self, dim, true);
});
}
Tensor& min_indices,
const Tensor& self,
c10::optional<int64_t> dim) {
- AT_DISPATCH_ALL_TYPES(self.type(), "min", [&] {
+ AT_DISPATCH_ALL_TYPES(self.scalar_type(), "min", [&] {
Reduction<scalar_t, int64_t>::apply(min, min_indices, self, dim, false);
});
}
}
static void sigmoid_kernel(Tensor& result, const Tensor& self) {
- AT_DISPATCH_FLOATING_TYPES(self.type(), "sigmoid", [&] {
+ AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "sigmoid", [&] {
using Vec = Vec256<scalar_t>;
CPU_tensor_parallel_kernel_apply2<scalar_t, scalar_t>(
result,
int64_t n = self.numel();
bool contig = self.is_contiguous();
- AT_DISPATCH_ALL_TYPES(self.type(), "bernoulli_scalar_cpu_", [&] {
+ AT_DISPATCH_ALL_TYPES(self.scalar_type(), "bernoulli_scalar_cpu_", [&] {
at::Tensor tmp_int_tensor;
if (std::is_same<scalar_t, int>::value && contig) {
tmp_int_tensor = self;
#define IMPLEMENT_FLOAT_KERNEL(dispatchtypes, op) \
static void op##_kernel(Tensor& result, const Tensor& self) { \
checkBackend(#op, {result}, Backend::CPU); \
- AT_DISPATCH_##dispatchtypes##_TYPES(self.type(), #op, [&] { \
+ AT_DISPATCH_##dispatchtypes##_TYPES(self.scalar_type(), #op, [&] { \
if (self.is_contiguous() && result.is_contiguous()) { \
vml::v##op( \
result.data<scalar_t>(), self.data<scalar_t>(), self.numel()); \
// case1: shared weight for all channels
if (weight_num == 1) {
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "prelu_cuda", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "prelu_cuda", [&] {
prelu_cuda_kernel_share_weights<scalar_t>(
input,
result,
cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice);
AT_CHECK(cuda::getApplyGrid(input_numel, grid, curDevice), "prelu: input too large or too many dimensions");
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "prelu_cuda", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "prelu_cuda", [&] {
prelu_cuda_kernel_multi_weights<scalar_t>
<<<grid, block, 0, stream>>>(
result.data<scalar_t>(),
Tensor weight_grad_collector = at::empty_like(input);
// case1: shared parameter for all channels
if (weight_num == 1) {
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "prelu_backward_cuda", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "prelu_backward_cuda", [&] {
prelu_cuda_backward_kernel_share_weights<scalar_t>(
input,
grad_out,
cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice);
AT_CHECK(cuda::getApplyGrid(input_numel, grid, curDevice), "prelu_backward_cuda: input too large or too many dimensions");
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "prelu_backward_cuda", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "prelu_backward_cuda", [&] {
prelu_cuda_backward_kernel_multi_weights<scalar_t>
<<<grid, block, 0, stream>>>(
input.data<scalar_t>(),
Tensor hardshrink_cuda(const Tensor & self, Scalar lambd) {
auto out_tensor = at::empty_like(self);
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.type(), "hardshrink_cuda", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.scalar_type(), "hardshrink_cuda", [&] {
hardshrink_cuda_kernel<scalar_t>(self, out_tensor, lambd.to<scalar_t>());
});
return out_tensor;
Tensor hardshrink_backward_cuda(const Tensor & grad, const Tensor & self, Scalar lambd) {
auto out_tensor = at::empty_like(grad);
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.type(), "hardshrink_backward_cuda", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.scalar_type(), "hardshrink_backward_cuda", [&] {
hardshrink_backward_cuda_kernel<scalar_t>(self, out_tensor, lambd.to<scalar_t>(), grad);
});
return out_tensor;
}
static void threshold_kernel(TensorIterator& iter, Scalar threshold, Scalar value) {
- AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, iter.type(), "threshold", [&] {
+ AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, iter.dtype(), "threshold_cuda", [&] {
threshold_kernel_impl<scalar_t>(iter, threshold.to<scalar_t>(), value.to<scalar_t>());
});
}
output.resize_({sizeD, osizeH, osizeW});
}
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
- input_.type(), "adaptive_avg_pool2d", [&] {
+ input_.scalar_type(), "adaptive_avg_pool2d_cuda", [&] {
scalar_t *input_data = input_.data<scalar_t>();
scalar_t *output_data = output.data<scalar_t>();
int64_t osizeH = gradOutput.size(-2);
int64_t osizeW = gradOutput.size(-1);
-
+
int64_t grid_x = sizeD;
if (input.ndimension() == 4) grid_x *= input.size(-4);
//bool atomic = (isizeW%osizeW != 0) || (isizeH%osizeH != 0);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
- input.type(), "adaptive_avg_pool2d_backward", [&] {
+ input.scalar_type(), "adaptive_avg_pool2d_backward_cuda", [&] {
scalar_t *gradOutput_data = gradOutput.data<scalar_t>();
scalar_t *gradInput_data = gradInput.data<scalar_t>();
auto self_working_copy = cloneBatchedColumnMajor(self);
auto A_working_copy = cloneBatchedColumnMajor(A);
std::vector<int64_t> infos(batchCount(self), 0);
- AT_DISPATCH_FLOATING_TYPES(self.type(), "gesv", [&]{
+ AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "gesv_cuda", [&]{
apply_gesv<scalar_t>(self_working_copy, A_working_copy, infos);
});
if (self.dim() > 2) {
- batchCheckErrors(infos, "gesv");
+ batchCheckErrors(infos, "gesv_cuda");
} else {
- singleCheckErrors(infos[0], "gesv");
+ singleCheckErrors(infos[0], "gesv_cuda");
}
return std::tuple<Tensor, Tensor>(self_working_copy, A_working_copy);
}
std::vector<int64_t> infos(batchCount(self), 0);
auto self_working_copy = cloneBatchedColumnMajor(self);
auto self_inv_working_copy = cloneBatchedColumnMajor(self);
- AT_DISPATCH_FLOATING_TYPES(self.type(), "inverse", [&]{
+ AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "inverse_cuda", [&]{
apply_inverse<scalar_t>(
self_working_copy, self_inv_working_copy, infos);
});
- batchCheckErrors(infos, "inverse");
+ batchCheckErrors(infos, "inverse_cuda");
return self_inv_working_copy;
}
int64_t info = 0;
auto self_working_copy = cloneBatchedColumnMajor(self);
auto A_working_copy = cloneBatchedColumnMajor(A);
- AT_DISPATCH_FLOATING_TYPES(self.type(), "cholesky_solve", [&]{
+ AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "cholesky_solve_cuda", [&]{
apply_cholesky_solve<scalar_t>(self_working_copy, A_working_copy, upper, info);
});
AT_CHECK(info == 0, "MAGMA cholesky_solve : invalid argument: ", -info);
self_working_copy = cloneBatchedColumnMajor(self);
}
- AT_DISPATCH_FLOATING_TYPES(self.type(), "cholesky", [&]{
+ AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "cholesky_cuda", [&]{
apply_cholesky<scalar_t>(self_working_copy, false, infos);
});
if (self.dim() > 2) {
- batchCheckErrors(infos, "cholesky");
+ batchCheckErrors(infos, "cholesky_cuda");
} else {
- singleCheckErrors(infos[0], "cholesky");
+ singleCheckErrors(infos[0], "cholesky_cuda");
}
if (upper) {
return self_working_copy.transpose(-1, -2);
self_row_stride = self.stride(-2), self_col_stride = self.stride(-1);
dim3 dim_block = cuda::getApplyBlock();
dim3 dim_grid((mat_size + dim_block.x - 1) / dim_block.x, n_batches);
- AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, self.type(), name, [&]{
+ AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, self.scalar_type(), name, [&]{
triu_tril_kernel<scalar_t, upper>
<<<dim_grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>(
result.data<scalar_t>(), self.data<scalar_t>(), k, mat_size,
}
static void add_kernel_cuda(TensorIterator& iter, Scalar alpha_scalar) {
- AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, iter.type(), "add", [&]() {
+ AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, iter.dtype(), "add_cuda", [&]() {
add_kernel_impl<scalar_t>(iter, alpha_scalar);
});
}
}
static void div_kernel_cuda(TensorIterator& iter) {
- if (isIntegralType(iter.type().scalarType())) {
- AT_DISPATCH_INTEGRAL_TYPES(iter.type(), "div", [&]() {
+ if (isIntegralType(iter.dtype())) {
+ AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "div_cuda", [&]() {
div_kernel_impl<scalar_t>(iter);
});
} else if (iter.is_cpu_scalar(2)) {
// optimization for floating-point types: if the second operand is a CPU
// scalar, compute a * reciprocal(b). Note that this may lose one bit of
// precision compared to computing the division.
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.type(), "div", [&]() {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "div_cuda", [&]() {
auto inv_b = scalar_t(1.0 / iter.scalar_value<scalar_t>(2));
iter.remove_operand(2);
div_constant_impl<scalar_t>(iter, inv_b);
});
} else {
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.type(), "div", [&]() {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "div_cuda", [&]() {
div_kernel_impl<scalar_t>(iter);
});
}
}
static void mul_kernel_cuda(TensorIterator& iter) {
- AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, iter.type(), "mul", [&]() {
+ AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, iter.dtype(), "mul_cuda", [&]() {
mul_kernel_impl<scalar_t>(iter);
});
}
Scalar _local_scalar_dense_cuda(const Tensor& self) {
Scalar r;
AT_DISPATCH_ALL_TYPES_AND(
- at::ScalarType::Half, self.type(), "_local_scalar_dense_cuda", [&] {
+ at::ScalarType::Half, self.scalar_type(), "_local_scalar_dense_cuda", [&] {
scalar_t value;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_CUDA_CHECK(cudaMemcpyAsync(&value, self.data<scalar_t>(), sizeof(scalar_t), cudaMemcpyDeviceToHost, stream));
cudaMemcpyHostToDevice,
stream));
AT_CUDA_CHECK(cudaStreamSynchronize(stream));
- AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.type(), "copy_from_cpu", [&]() {
+ AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "copy_from_cpu", [&]() {
copy_device_to_device<scalar_t, scalar_t>(dst, dst_contig);
});
}
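Both the scalar read above and these copy helpers follow the same discipline: enqueue cudaMemcpyAsync on the current stream, then cudaStreamSynchronize before host code touches the destination. A minimal standalone version of the device-to-host scalar read (plain CUDA runtime API, error checks elided):

template <typename T>
T read_device_scalar(const T* dev_ptr, cudaStream_t stream) {
  T value;
  // Enqueue the copy; `value` is not valid until the stream drains.
  cudaMemcpyAsync(&value, dev_ptr, sizeof(T), cudaMemcpyDeviceToHost, stream);
  // Block until the copy has landed in host memory.
  cudaStreamSynchronize(stream);
  return value;
}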
CUDAGuard device_guard(dst.device());
CUDAStream stream = getCurrentCUDAStream();
- AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.type(), "copy_from_cpu_async", [&]() {
+ AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "copy_from_cpu_async", [&]() {
AT_CUDA_CHECK(cudaMemcpyAsync(
dst.data<scalar_t>(),
src.data<scalar_t>(),
CUDAGuard device_guard(src.device());
CUDAStream stream = getCurrentCUDAStream();
- AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.type(), "copy_to_cpu_async", [&]() {
+ AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "copy_to_cpu_async", [&]() {
AT_CUDA_CHECK(cudaMemcpyAsync(
dst.data<scalar_t>(),
src.data<scalar_t>(),
template <typename dst_T>
void _copy__cuda(Tensor& dst, const Tensor& src, bool non_blocking) {
AT_CHECK(dst.numel() == src.numel(), "sizes do not match");
- AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.type(), "_copy__cuda", [&]() {
+ AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_copy__cuda", [&]() {
if (dst.is_cuda() && src.is_cuda()) {
copy_device_to_device<dst_T, scalar_t>(dst, src);
} else if (dst.is_cuda()) {
namespace native {
Tensor& _s_copy__cuda(Tensor& self, const Tensor& src, bool non_blocking) {
- AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, self.type(), "_copy__cuda", [&]() {
+ AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, self.scalar_type(), "_copy__cuda", [&]() {
::_copy__cuda<scalar_t>(self, src, non_blocking);
});
return self;
const dim3 grid(r1*r2);
const dim3 block(forward_threads);
- AT_DISPATCH_FLOATING_TYPES(x1.type(), "cdist_cuda", [&] {
+ AT_DISPATCH_FLOATING_TYPES(x1.scalar_type(), "cdist_cuda", [&] {
if (p == 0.0) {
cdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::zero><<<grid, block>>>(result.data<scalar_t>(), x1.data<scalar_t>(), x2.data<scalar_t>(), p, r1, r2, m);
} else if (p == 1.0) {
const double n2 = n - .5;
const double n2_squared_minus_1 = n2 * n2 - 1;
- AT_DISPATCH_FLOATING_TYPES(self.type(), "pdist_cuda", [&] {
+ AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "pdist_cuda", [&] {
if (p == 0.0) {
pdist_kernel_cuda_impl<scalar_t, dists<scalar_t>::zero><<<grid, block>>>(result.data<scalar_t>(), self.data<scalar_t>(), n, m, p, n2, n2_squared_minus_1);
} else if (p == 1.0) {
const double n2_squared_minus_1 = n2 * n2 - 1;
Tensor buffer = at::empty({n - 1, result.size(0), result.size(1)}, result.options());
- AT_DISPATCH_FLOATING_TYPES(self.type(), "pdist_cuda_backward", [&] {
+ AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "pdist_cuda_backward", [&] {
if (p == 1.0) {
pdist_backward_kernel_cuda_impl<scalar_t, dists<scalar_t>::one><<<grid, block>>>(buffer.data<scalar_t>(), grad.data<scalar_t>(), self.data<scalar_t>(), dist.data<scalar_t>(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1);
} else if (p < 2.0) {
namespace at { namespace native {
Tensor _s_poisson_cuda(const Tensor& lambda, Generator* gen) {
Tensor ret = at::empty(lambda.sizes(), lambda.options());
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(ret.type(), "poisson", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(ret.scalar_type(), "poisson_cuda", [&] {
poisson_cuda_kernel<scalar_t>(ret, lambda, next_philox_seed(gen, 20));
});
return ret;
Tensor _s_gamma_cuda(const Tensor& alpha, Generator* gen) {
Tensor ret = at::empty(alpha.sizes(), alpha.options());
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(ret.type(), "gamma", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(ret.scalar_type(), "gamma_cuda", [&] {
gamma_cuda_kernel<scalar_t>(ret, alpha, next_philox_seed(gen, 10));
});
return ret;
Tensor _standard_gamma_grad_cuda(const Tensor& self, const Tensor& output) {
Tensor ret = at::empty(self.sizes(), self.options());
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.type(), "_standard_gamma_grad", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.scalar_type(), "_standard_gamma_grad_cuda", [&] {
gamma_grad_cuda_kernel<scalar_t>(ret, self, output);
});
return ret;
Tensor& bernoulli_tensor_cuda_(Tensor &self, const Tensor& p_, Generator* gen) {
auto p = std::get<0>(expand_inplace(self, p_.to(kCUDA)));
AT_DISPATCH_ALL_TYPES_AND(
- at::ScalarType::Half, self.type(), "bernoulli_tensor_cuda_self_", [&] {
+ at::ScalarType::Half, self.scalar_type(), "bernoulli_tensor_cuda_self_", [&] {
const at::Type& p_type = p.type();
using self_t = scalar_t;
auto seeds = next_philox_seed(gen, 10);
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(p.type(), "bernoulli_tensor_cuda_p_", [&] {
+ AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, p.scalar_type(), "bernoulli_tensor_cuda_p_", [&] {
using p_t = scalar_t;
return bernoulli_tensor_cuda_kernel<self_t, p_t>(self, p, seeds);
});
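Note the nested dispatch here: the outer macro binds scalar_t to self's dtype and aliases it to self_t before the inner macro shadows scalar_t with p's dtype, so one kernel is instantiated per (self dtype, p dtype) pair. The shape of the pattern, with a hypothetical kernel name:

AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, self.scalar_type(), "outer_example", [&] {
  using self_t = scalar_t;                   // alias before the inner macro shadows it
  AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, p.scalar_type(), "inner_example", [&] {
    using p_t = scalar_t;                    // inner binding
    two_dtype_kernel<self_t, p_t>(self, p);  // hypothetical kernel
  });
});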
Tensor& bernoulli_scalar_cuda_(Tensor &self, double p, Generator* gen) {
AT_CHECK(0 <= p && p <= 1, "bernoulli_ expects p to be in [0, 1], but got p=", p);
- AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, self.type(), "bernoulli_scalar_cuda_", [&] {
+ AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, self.scalar_type(), "bernoulli_scalar_cuda_", [&] {
auto seeds = next_philox_seed(gen, 10);
bernoulli_scalar_cuda_kernel<scalar_t>(self, p, seeds);
});
// Number of times random values are generated per thread, used to offset the Philox counter in the THC random state
int64_t counter_offset = ((nelem - 1)/(block_size*grid.x*UNROLL)+1)*UNROLL;
if (cuda::detail::canUse32BitIndexMath(self)){
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.type(), "fused_dropout", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.scalar_type(), "fused_dropout", [&] {
using accscalar_t = acc_type<scalar_t, true>;
accscalar_t pa = (accscalar_t)(p);
auto self_info = cuda::detail::getTensorInfo<scalar_t, unsigned int>(self);
}
});
} else {
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.type(), "fused_dropout", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.scalar_type(), "fused_dropout", [&] {
using accscalar_t = acc_type<scalar_t, true>;
accscalar_t pa = (accscalar_t)(p);
auto self_info = cuda::detail::getTensorInfo<scalar_t, uint64_t>(self);
Tensor masked_scale_cuda(const Tensor& self, const Tensor& mask, double scale){
Tensor ret = at::empty_like(self);
AT_CHECK(mask.scalar_type() == at::ScalarType::Byte, "mask should be torch.uint8 dtype");
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(ret.type(), "masked_scale", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(ret.scalar_type(), "masked_scale", [&] {
using accscalar_t = acc_type<scalar_t, true>;
accscalar_t pa = (accscalar_t)(scale);
masked_scale_kernel<scalar_t>(ret, self, mask, pa);
dim3 block(WARP_SIZE, BLOCKDIMY);
AT_DISPATCH_FLOATING_TYPES_AND_HALF
- (grad.type(),
+ (grad.scalar_type(),
"embedding_backward",
[&]
{
dim3 grid(THCCeilDiv(num_indices, (int64_t) 4), THCCeilDiv(stride, (int64_t) 128));
dim3 block(32, 4);
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad.type(), "embedding_backward", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad.scalar_type(), "embedding_backward", [&] {
embedding_backward_kernel<<<grid, block, 0, stream>>>(
sorted_indices.data<int64_t>(),
orig_indices.data<int64_t>(),
dim3 block(128);
int dim = self.stride(0);
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.type(), "embedding_backward", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.scalar_type(), "embedding_backward", [&] {
using accscalar_t = acc_type<scalar_t, true>;
renorm_kernel<<<grid, block, 128 * sizeof(accscalar_t), stream>>>(
self.data<scalar_t>(),
dim3 grid(THCCeilDiv(numel, (ptrdiff_t)4), THCCeilDiv(stride, (int64_t)128));
dim3 block(32, 4);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
- grad.type(), "embedding_bag_backward_cuda_sum_avg_kernel", [&] {
+ grad.scalar_type(), "embedding_bag_backward_cuda_sum_avg_kernel", [&] {
EmbeddingBag_accGradParametersKernel_sum_avg<
scalar_t><<<grid, block, 0, stream>>>(
sorted_indices.data<int64_t>(), orig_indices.data<int64_t>(),
int grid = 1024;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
- grad.type(), "embedding_bag_backward_cuda_max", [&] {
+ grad.scalar_type(), "embedding_bag_backward_cuda_max", [&] {
EmbeddingBag_accGradParametersKernel_max<
scalar_t><<<grid, block, 0, stream>>>(
max_indices.data<int64_t>(), grad.data<scalar_t>(),
dim3 block = dim3(32, 8);
int grid = 1024;
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(weight.type(), "embedding_bag_cuda", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(weight.scalar_type(), "embedding_bag_cuda", [&] {
EmbeddingBag_updateOutputKernel<scalar_t><<<grid, block, 0, stream>>>(
indices.data<int64_t>(), offsets.data<int64_t>(),
weight.data<scalar_t>(), output.data<scalar_t>(),
input_.size(0));
dim3 block(outputPlaneSize > 128 ? 128 : outputPlaneSize);
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(),
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(),
"fractional_max_pool2d_out_cuda_frame",
[&] {
auto devInput = input_.packed_accessor<scalar_t, 4>();
dim3 block(outputPlaneSize > 128 ? 128 : outputPlaneSize);
auto devIndices = indices.packed_accessor<int64_t, 4>();
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(gradOutput.type(),
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(gradOutput.scalar_type(),
"fractional_max_pool2d_backward_out_cuda_frame",
[&] {
auto devGradInput = gradInput_.packed_accessor<scalar_t, 4>();
dim3 block(outputPlaneSize > 128 ? 128 : outputPlaneSize);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
- input.type(),
+ input.scalar_type(),
"fractional_max_pool3d_out_frame",
[&]{
fractional_max_pool3d_out_frame<scalar_t>
dim3 block(outputPlaneSize > 128 ? 128 : outputPlaneSize);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
- gradOutput.type(),
+ gradOutput.scalar_type(),
"fractional_max_pool3d_backward_out_frame",
[&] {
fractional_max_pool3d_backward_out_frame<scalar_t>
auto output = at::empty({N, input.size(1), H, W}, input.options());
int count = static_cast<int>(N * H * W);
if (count > 0) {
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "grid_sampler_2d_cuda", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_2d_cuda", [&] {
grid_sampler_2d_kernel<scalar_t>
<<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>(
count,
auto output = at::empty({N, input.size(1), D, H, W}, input.options());
int count = static_cast<int>(N * D * H * W);
if (count > 0) {
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "grid_sampler_2d_cuda", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_2d_cuda", [&] {
grid_sampler_3d_kernel<scalar_t>
<<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>(
count,
auto grad_grid = at::empty_like(grid);
int count = static_cast<int>(N * H * W);
if (count > 0) {
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "grid_sampler_2d_backward_cuda", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_2d_backward_cuda", [&] {
grid_sampler_2d_backward_kernel<scalar_t>
<<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>(
count,
auto grad_grid = at::empty_like(grid);
int count = static_cast<int>(N * D * H * W);
if (count > 0) {
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "grid_sampler_3d_backward_cuda", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_3d_backward_cuda", [&] {
grid_sampler_3d_backward_kernel<scalar_t>
<<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>(
count,
}
static void index_kernel(TensorIterator& iter, IntArrayRef index_size, IntArrayRef index_stride) {
- AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, iter.type(), "index", [&] {
+ AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, iter.dtype(), "index_cuda", [&] {
using dtype = OpaqueType<sizeof(scalar_t)>;
index_kernel_impl<dtype>(iter, index_size, index_stride);
});
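The OpaqueType<sizeof(scalar_t)> alias above is a compile-time economy: index/gather kernels only move bytes, so every dtype of a given width can share one instantiation keyed on size alone. A minimal sketch of such a size-keyed type (the real OpaqueType may differ in detail):

// One trivially copyable blob per element width; a gather kernel templated on
// this compiles once for {float, int32_t, ...} instead of once per dtype.
template <int N>
struct SizedBlob {
  char bytes[N];
};
static_assert(sizeof(SizedBlob<4>) == 4, "blob must introduce no padding");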
static void index_put_kernel(TensorIterator& iter, IntArrayRef index_size, IntArrayRef index_stride, bool accumulate) {
AT_ASSERTM(!accumulate, "index_put does not support accumulate=true");
- AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, iter.type(), "index_put", [&] {
+ AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, iter.dtype(), "index_put", [&] {
using dtype = OpaqueType<sizeof(scalar_t)>;
index_put_kernel_impl<dtype>(iter, index_size, index_stride);
});
Tensor b_self, b_end, b_weight;
AT_CHECK(weight.dim() <= std::max(self.dim(), end.dim()),
"weight should be of dimension max(self.dim(), end.dim()) or lesser");
- std::tie(b_self, b_end, b_weight) = expand_outplace(self, end, weight, "lerp_out");
+ std::tie(b_self, b_end, b_weight) = expand_outplace(self, end, weight, "lerp_out_cuda");
result.resize_as_(b_self);
- AT_DISPATCH_FLOATING_TYPES(self.type(), "lerp_out", [&]{
+ AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "lerp_out_cuda", [&]{
lerp_cuda<scalar_t>(result, b_self, b_end, b_weight);
});
return result;
Tensor& lerp_cuda_scalar_out(Tensor& result, const Tensor& self,
const Tensor& end, Scalar weight) {
Tensor b_self, b_end;
- std::tie(b_self, b_end) = expand_outplace(self, end, "lerp_out");
+ std::tie(b_self, b_end) = expand_outplace(self, end, "lerp_out_cuda");
result.resize_as_(b_self);
- AT_DISPATCH_FLOATING_TYPES(self.type(), "lerp_out", [&]{
+ AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "lerp_out_cuda", [&]{
lerp_cuda<scalar_t>(result, b_self, b_end, weight.to<scalar_t>());
});
return result;
Tensor& lerp_cuda_tensor_(Tensor& self, const Tensor& end, const Tensor& weight) {
Tensor b_self, b_end, b_weight;
- std::tie(b_self, b_end, b_weight) = expand_outplace(self, end, weight, "lerp_");
+ std::tie(b_self, b_end, b_weight) = expand_outplace(self, end, weight, "lerp__cuda");
AT_CHECK(b_self.sizes() == self.sizes(),
"output with shape ", self.sizes(),
" doesn't match the broadcast shape ", b_self.sizes());
AT_CHECK(weight.dim() <= std::max(self.dim(), end.dim()),
"weight should be of dimension max(self.dim(), end.dim()) or lesser");
- AT_DISPATCH_FLOATING_TYPES(self.type(), "lerp_", [&]{
+ AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "lerp__cuda", [&]{
lerp_cuda<scalar_t>(self, b_self, b_end, b_weight);
});
return self;
Tensor& lerp_cuda_scalar_(Tensor& self, const Tensor& end, Scalar weight) {
Tensor b_self, b_end;
- std::tie(b_self, b_end) = expand_outplace(self, end, "lerp_");
+ std::tie(b_self, b_end) = expand_outplace(self, end, "lerp__cuda");
AT_CHECK(b_self.sizes() == self.sizes(),
"output with shape ", self.sizes(),
" doesn't match the broadcast shape ", b_self.sizes());
- AT_DISPATCH_FLOATING_TYPES(self.type(), "lerp_", [&]{
+ AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "lerp__cuda", [&]{
lerp_cuda<scalar_t>(self, b_self, b_end, weight.to<scalar_t>());
});
return self;
Tensor b_self, b_end, b_weight;
AT_CHECK(weight.dim() <= std::max(self.dim(), end.dim()),
"weight should be of dimension max(self.dim(), end.dim()) or lesser");
- std::tie(b_self, b_end, b_weight) = expand_outplace(self, end, weight, "lerp");
+ std::tie(b_self, b_end, b_weight) = expand_outplace(self, end, weight, "lerp_cuda");
Tensor result = at::empty_like(b_self);
- AT_DISPATCH_FLOATING_TYPES(result.type(), "lerp", [&]{
+ AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "lerp_cuda", [&]{
lerp_cuda<scalar_t>(result, b_self, b_end, b_weight);
});
return result;
Tensor lerp_cuda_scalar(const Tensor& self, const Tensor& end, Scalar weight) {
Tensor b_self, b_end;
- std::tie(b_self, b_end) = expand_outplace(self, end, "lerp");
+ std::tie(b_self, b_end) = expand_outplace(self, end, "lerp_cuda");
Tensor result = at::empty_like(b_self);
- AT_DISPATCH_FLOATING_TYPES(result.type(), "lerp", [&]{
+ AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "lerp_cuda", [&]{
lerp_cuda<scalar_t>(result, b_self, b_end, weight.to<scalar_t>());
});
return result;
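All six lerp variants above reduce to the same pointwise formula, lerp(self, end, weight) = self + weight * (end - self); they differ only in broadcasting (via expand_outplace) and in whether the result is written in place. Elementwise reference:

// Reference semantics for one element; `weight` is either the Scalar overload's
// value or the matching element of the broadcast weight tensor.
template <typename T>
T lerp_element(T self, T end, T weight) {
  return self + weight * (end - self);
}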
Tensor kl_div_backward_cuda(const Tensor& grad, const Tensor& input, const Tensor& target, int64_t reduction) {
auto grad_input = at::zeros_like(input);
Tensor grad_expand = grad.expand_as(input);
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "kl_div_backward", [&]() {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "kl_div_backward_cuda", [&]() {
kl_div_backward_kernel<scalar_t>(grad_input, target, grad_expand);
});
if (reduction == Reduction::Mean) {
bool have_three; // flag which of the two cases in eq (6) we have
if (s < 2*target_length+1) {
current_char = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK);
- have_three = ((s > 1) && (get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s-2, BLANK) !=
- current_char));
+ have_three = ((s > 1) && (get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s-2, BLANK) != current_char));
} else {
current_char = BLANK;
have_three = false;
std::tuple<Tensor, Tensor> ctc_loss_gpu(const Tensor& log_probs, const Tensor& targets, IntArrayRef input_lengths, IntArrayRef target_lengths, int64_t BLANK, bool zero_infinity) {
(void)zero_infinity; // only used for backward
- return AT_DISPATCH_FLOATING_TYPES(log_probs.type(), "ctc_loss", [&] {
+ return AT_DISPATCH_FLOATING_TYPES(log_probs.scalar_type(), "ctc_loss_cuda", [&] {
if (targets.scalar_type() == kLong) {
return ctc_loss_gpu_template<scalar_t, kLong>(log_probs, targets, input_lengths, target_lengths, BLANK);
} else {
Tensor ctc_loss_backward_gpu(const Tensor& grad, const Tensor& log_probs, const Tensor& targets, IntArrayRef input_lengths, IntArrayRef target_lengths,
const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK, bool zero_infinity) {
- return AT_DISPATCH_FLOATING_TYPES(log_probs.type(), "ctc_loss_backward", [&] {
+ return AT_DISPATCH_FLOATING_TYPES(log_probs.scalar_type(), "ctc_loss_backward_cuda", [&] {
if (targets.scalar_type() == kLong) {
return ctc_loss_backward_gpu_template<scalar_t, kLong>(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK, zero_infinity);
} else {
std::tuple<Tensor, Tensor, Tensor> batch_norm_cuda(const Tensor& self, const Tensor& weight, const Tensor& bias,
const Tensor& running_mean, const Tensor& running_var, bool train, double momentum, double epsilon) {
- return AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.type(), "batch_norm", [&] {
+ return AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.scalar_type(), "batch_norm_cuda", [&] {
if (cuda::detail::canUse32BitIndexMath(self)) {
return batch_norm_cuda_template<scalar_t, int32_t>(self, weight, bias, running_mean, running_var, train, momentum, epsilon);
} else {
std::tuple<Tensor, Tensor, Tensor> batch_norm_backward_cuda(const Tensor& grad_out, const Tensor& self, const Tensor& weight, const Tensor& running_mean, const Tensor& running_var,
const Tensor& save_mean, const Tensor& save_invstd, bool train, double epsilon, std::array<bool,3> grad_input_mask) {
- return AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.type(), "batch_norm_backward", [&] {
+ return AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.scalar_type(), "batch_norm_backward_cuda", [&] {
if (cuda::detail::canUse32BitIndexMath(self)) {
return batch_norm_backward_cuda_template<scalar_t, int32_t>(grad_out, self, weight, running_mean, running_var, save_mean, save_invstd, train, epsilon, grad_input_mask);
} else {
}
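The canUse32BitIndexMath branch recurring through these batch-norm (and the earlier dropout) entry points picks the cheapest index type the tensor allows: 32-bit index arithmetic is faster on the GPU, so the int64_t instantiation is reserved for tensors too large to address that way. The gate in isolation, with a hypothetical kernel template:

template <typename index_t>
void my_kernel_template(const at::Tensor& t);  // hypothetical

void launch(const at::Tensor& t) {
  if (at::cuda::detail::canUse32BitIndexMath(t)) {
    my_kernel_template<int32_t>(t);  // cheaper index arithmetic
  } else {
    my_kernel_template<int64_t>(t);  // needed once offsets exceed INT32_MAX
  }
}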
std::tuple<Tensor, Tensor> batch_norm_stats_cuda(const Tensor& self, double epsilon) {
- return AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.type(), "batch_norm_stats", [&] {
+ return AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.scalar_type(), "batch_norm_stats_cuda", [&] {
if (cuda::detail::canUse32BitIndexMath(self)) {
return batch_norm_stats_cuda_template<scalar_t, int32_t>(self, epsilon);
} else {
Tensor batch_norm_elemt_cuda(const Tensor& self, const Tensor& weight, const Tensor& bias,
const Tensor& mean, const Tensor& invstd, double epsilon) {
- return AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.type(), "batch_norm_elemt", [&] {
+ return AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.scalar_type(), "batch_norm_elemt", [&] {
if (cuda::detail::canUse32BitIndexMath(self)) {
return batch_norm_elemt_cuda_template<scalar_t, int32_t>(self, weight, bias, mean, invstd, epsilon);
} else {
// accepting input(self) here to determine template data types, since running_mean/running_var are optional
std::tuple<Tensor, Tensor> batch_norm_gather_stats_cuda(const Tensor& self, const Tensor& mean, const Tensor& invstd, const Tensor& running_mean,
const Tensor& running_var, double momentum, double epsilon, int64_t count) {
- return AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.type(), "batch_norm_update_stats", [&] {
+ return AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.scalar_type(), "batch_norm_update_stats_cuda", [&] {
int world_size = mean.size(1);
using accscalar_t = at::acc_type<scalar_t, true>;
if (cuda::detail::canUse32BitIndexMath(self)) {
std::tuple<Tensor, Tensor, Tensor, Tensor> batch_norm_backward_reduce_cuda(const Tensor& self, const Tensor& input, const Tensor& mean,
const Tensor& invstd, bool input_g, bool weight_g, bool bias_g) {
- return AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.type(), "batch_norm_backward_reduce", [&] {
+ return AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.scalar_type(), "batch_norm_backward_reduce", [&] {
if (cuda::detail::canUse32BitIndexMath(self)) {
return batch_norm_backward_reduce_cuda_template<scalar_t, int32_t>(self, input, mean, invstd, input_g, weight_g, bias_g);
} else {
Tensor batch_norm_backward_elemt_cuda(const Tensor& self, const Tensor& input, const Tensor& mean, const Tensor& invstd,
const Tensor& weight, const Tensor& mean_dy, const Tensor& mean_dy_xmu) {
- return AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.type(), "batch_norm_backward_elemt", [&] {
+ return AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.scalar_type(), "batch_norm_backward_elemt", [&] {
if (cuda::detail::canUse32BitIndexMath(self)) {
return batch_norm_backward_elemt_cuda_template<scalar_t, int32_t>(self, input, mean, invstd, weight, mean_dy, mean_dy_xmu);
} else {
std::tuple<Tensor, Tensor> batch_norm_update_stats_cuda(
const Tensor& self, const Tensor& running_mean, const Tensor& running_var, double momentum) {
- return AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.type(), "batch_norm_backward", [&] {
+ return AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.scalar_type(), "batch_norm_backward", [&] {
auto mean_st = running_mean.dtype();
auto var_st = running_var.dtype();
AT_CHECK(mean_st == var_st, "running_mean and running_var need to have the same data types");
auto workspace = at::empty_like(input_gates);
auto hy = at::empty_like(cx);
auto cy = at::empty_like(cx);
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(input_gates.type(), "_thnn_fused_lstm_cell_cuda", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(input_gates.scalar_type(), "_thnn_fused_lstm_cell_cuda", [&] {
if (canUse32BitIndexMath(workspace)) { // See Note [64-bit index math check elision]
lstm_forward_impl<scalar_t, int32_t>(input_gates, hidden_gates, input_bias, hidden_bias, cx, hy, cy, workspace);
} else {
auto grad_gates = at::empty_like(workspace);
auto grad_cx = at::empty_like(cx);
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(workspace.type(), "_thnn_fused_lstm_cell_cuda_backward", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(workspace.scalar_type(), "_thnn_fused_lstm_cell_cuda_backward", [&] {
if (canUse32BitIndexMath(workspace)) { // See Note [64-bit index math check elision]
lstm_backward_impl<scalar_t, int32_t>(grad_hy, grad_cy, cx, cy, workspace, grad_gates, grad_cx);
} else {
auto workspace = at::empty({hx.size(0), hx.size(1) * GRU_WORKSPACE_MULTIPLIER}, hx.options());
auto hy = at::empty_like(hx);
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(input_gates.type(), "_thnn_fused_gru_cell_cuda", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(input_gates.scalar_type(), "_thnn_fused_gru_cell_cuda", [&] {
if (canUse32BitIndexMath(workspace)) { // See Note [64-bit index math check elision]
gru_forward_impl<scalar_t, int32_t>(input_gates, hidden_gates, input_bias, hidden_bias, hx, hy, workspace);
} else {
auto grad_input_gates = at::empty({workspace.size(0), hidden_size * 3}, workspace.options());
auto grad_hidden_gates = at::empty({workspace.size(0), hidden_size * 3}, workspace.options());
auto grad_hx = at::empty_like(grad_hy);
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad_hy.type(), "_thnn_fused_gru_cell_cuda_backward", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad_hy.scalar_type(), "_thnn_fused_gru_cell_cuda_backward", [&] {
if (canUse32BitIndexMath(workspace)) { // See Note [64-bit index math check elision]
gru_backward_impl<scalar_t, int32_t>(grad_hy, workspace, grad_input_gates, grad_hidden_gates, grad_hx);
} else {
} else if (steps == 1) {
r.fill_(start);
} else {
- AT_DISPATCH_FLOATING_TYPES(r.type(), "linspace", [&]() {
+ AT_DISPATCH_FLOATING_TYPES(r.scalar_type(), "linspace_cuda", [&]() {
scalar_t scalar_start = start.to<scalar_t>();
scalar_t scalar_end = end.to<scalar_t>();
scalar_t step = (scalar_end - scalar_start) / static_cast<scalar_t>(steps - 1);
} else if (steps == 1) {
r.fill_(std::pow(10.0, start.to<double>()));
} else {
- AT_DISPATCH_FLOATING_TYPES(r.type(), "logspace", [&]() {
+ AT_DISPATCH_FLOATING_TYPES(r.scalar_type(), "logspace_cuda", [&]() {
scalar_t scalar_start = start.to<scalar_t>();
scalar_t scalar_end = end.to<scalar_t>();
scalar_t step = (scalar_end - scalar_start) / static_cast<scalar_t>(steps - 1);
}
Tensor& range_cuda_out(Tensor& result, Scalar start, Scalar end, Scalar step) {
- AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, result.type(), "range", [&]() {
+ AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, result.scalar_type(), "range_cuda", [&]() {
using accscalar_t = at::acc_type<scalar_t, true>;
auto xstart = start.to<accscalar_t>();
auto xend = end.to<accscalar_t>();
}
Tensor& arange_cuda_out(Tensor& result, Scalar start, Scalar end, Scalar step) {
- AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, result.type(), "arange", [&]() {
+ AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, result.scalar_type(), "arange_cuda", [&]() {
using accscalar_t = at::acc_type<scalar_t, true>;
auto xstart = start.to<accscalar_t>();
auto xend = end.to<accscalar_t>();
}
static void std_var_kernel_cuda(TensorIterator& iter, bool unbiased, bool take_sqrt) {
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.type(), "std", [&]() {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "std", [&]() {
std_var_kernel_impl<scalar_t>(iter, unbiased, take_sqrt);
});
}
}
static void sum_kernel_cuda(TensorIterator& iter) {
- if (iter.type().scalarType() == kHalf) {
+ if (iter.dtype() == kHalf) {
return sum_kernel_impl<at::Half, float>(iter);
- } else if (iter.type(1).scalarType() == kHalf && iter.type().scalarType() == kFloat) {
+ } else if (iter.dtype(1) == kHalf && iter.dtype() == kFloat) {
// type promotion that does cast and reduction in a single kernel
return sum_kernel_impl<at::Half, float, float>(iter);
}
- AT_DISPATCH_ALL_TYPES(iter.type(), "sum", [&]() {
+ AT_DISPATCH_ALL_TYPES(iter.dtype(), "sum_cuda", [&]() {
sum_kernel_impl<scalar_t>(iter);
});
}
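The kHalf special cases in these reductions accumulate in float rather than half. A standalone illustration of why reductions use a wider accumulator, shown one precision level up (float vs. double accumulator, since standard C++ has no half type):

#include <cstdio>

int main() {
  float  acc_f = 0.0f;
  double acc_d = 0.0;
  for (long i = 0; i < 20000000; ++i) {
    acc_f += 1.0f;  // stalls at 2^24 = 16777216, where adding 1.0f no longer changes it
    acc_d += 1.0;   // stays exact
  }
  std::printf("float accumulator:  %.1f\n", acc_f);  // 16777216.0
  std::printf("double accumulator: %.1f\n", acc_d);  // 20000000.0
  return 0;
}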
static void prod_kernel_cuda(TensorIterator& iter) {
- if (iter.type().scalarType() == kHalf) {
+ if (iter.dtype() == kHalf) {
return prod_kernel_impl<at::Half, float>(iter);
}
- AT_DISPATCH_ALL_TYPES(iter.type(), "prod", [&]() {
+ AT_DISPATCH_ALL_TYPES(iter.dtype(), "prod_cuda", [&]() {
prod_kernel_impl<scalar_t>(iter);
});
}
static void mean_kernel_cuda(TensorIterator& iter) {
- if (iter.type().scalarType() == kHalf) {
+ if (iter.dtype() == kHalf) {
return mean_kernel_impl<at::Half, float>(iter);
- } else if (iter.type(1).scalarType() == kHalf && iter.type().scalarType() == kFloat) {
+ } else if (iter.dtype(1) == kHalf && iter.dtype() == kFloat) {
// type promotion that does cast and reduction in a single kernel
return mean_kernel_impl<at::Half, float, float>(iter);
}
- AT_DISPATCH_ALL_TYPES(iter.type(), "mean", [&]() {
+ AT_DISPATCH_ALL_TYPES(iter.dtype(), "mean_cuda", [&]() {
mean_kernel_impl<scalar_t>(iter);
});
}
static void norm_kernel_cuda(TensorIterator& iter, Scalar p) {
- if (iter.type().scalarType() == kHalf) {
+ if (iter.dtype() == kHalf) {
return norm_kernel_cuda_impl<at::Half, float>(iter, p);
- } else if (iter.type(1).scalarType() == kHalf && iter.type().scalarType() == kFloat) {
+ } else if (iter.dtype(1) == kHalf && iter.dtype() == kFloat) {
// type promotion that does cast and reduction in a single kernel
return norm_kernel_cuda_impl<at::Half, float, float>(iter, p);
}
- AT_DISPATCH_FLOATING_TYPES(iter.type(), "norm", [&]() {
+ AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "norm_cuda", [&]() {
norm_kernel_cuda_impl<scalar_t>(iter, p);
});
}
}
void max_values_kernel_cuda(TensorIterator& iter) {
- AT_DISPATCH_ALL_TYPES(iter.type(), "max_values", [&]() {
+ AT_DISPATCH_ALL_TYPES(iter.dtype(), "max_values_cuda", [&]() {
max_values_kernel_cuda_impl<scalar_t>(iter);
});
}
void min_values_kernel_cuda(TensorIterator& iter) {
- AT_DISPATCH_ALL_TYPES(iter.type(), "min_values", [&]() {
+ AT_DISPATCH_ALL_TYPES(iter.dtype(), "min_values_cuda", [&]() {
min_values_kernel_cuda_impl<scalar_t>(iter);
});
}
Tensor input = input_.contiguous();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
- input.type(), "reflection_pad1d_out_template", [&] {
+ input.scalar_type(), "reflection_pad1d_out_template", [&] {
reflection_pad1d_out_kernel<<<
grid_size, block_size, 0, at::cuda::getCurrentCUDAStream()>>>(
input.data<scalar_t>(), output.data<scalar_t>(),
dim3 grid_size((int) ::ceil(output_w / 256.0), nplane, nbatch);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
- grad_input.type(), "reflection_pad1d_backward_out_template", [&] {
+ grad_input.scalar_type(), "reflection_pad1d_backward_out_template", [&] {
reflection_pad1d_backward_out_kernel<<<
grid_size, block_size, 0, at::cuda::getCurrentCUDAStream()>>>(
grad_input.data<scalar_t>(), grad_output.data<scalar_t>(),
(int) std::ceil(output_plane_size/256.0), nplane, nbatch);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
- input.type(), "reflection_pad2d_out_template", [&] {
+ input.scalar_type(), "reflection_pad2d_out_template", [&] {
reflection_pad2d_out_kernel<<<
grid_size, block_size, 0, at::cuda::getCurrentCUDAStream()>>>(
input.data<scalar_t>(), output.data<scalar_t>(),
(int) std::ceil(output_plane_size/256.0), nplane, nbatch);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
- input.type(), "reflection_pad2d_backward_out_template", [&] {
+ input.scalar_type(), "reflection_pad2d_backward_out_template", [&] {
reflection_pad2d_backward_out_kernel<<<
grid_size, block_size, 0, at::cuda::getCurrentCUDAStream()>>>(
grad_input.data<scalar_t>(), grad_output.data<scalar_t>(),
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
- input.type(), "replication_pad1d", [&] {
+ input.scalar_type(), "replication_pad1d_cuda", [&] {
if (numInputDims == 2) {
gradInput.zero_();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
- input.type(), "replication_pad1d_backward", [&] {
+ input.scalar_type(), "replication_pad1d_backward_cuda", [&] {
auto gradInput_ = gradInput;
auto gradOutput_ = gradOutput;
" Calculated output H: ", outputH, " W: ", outputW);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
- input.type(), "replication_pad2d", [&] {
+ input.scalar_type(), "replication_pad2d_cuda", [&] {
if (numInputDims == 3) {
dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
replication_pad_forward_kernel2d <<<gridSize, blockSize, 0,
- at::cuda::getCurrentCUDAStream()>>>(devInput, devOutput,
+ at::cuda::getCurrentCUDAStream()>>>(devInput, devOutput,
padT, padB, padL, padR);
}
}
int padL = paddingSize[0];
int padR = paddingSize[1];
int padT = paddingSize[2];
- int padB = paddingSize[3];
+ int padB = paddingSize[3];
int planeDim = 0;
int dimh = 1;
int dimw = 2;
gradInput.zero_();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
- input.type(), "replication_pad2d_backward", [&] {
+ input.scalar_type(), "replication_pad2d_backward_cuda", [&] {
auto gradInput_ = gradInput;
auto gradOutput_ = gradOutput;
int pleft, int pright,
int ptop, int pbottom,
int pfront, int pback) {
- AT_CHECK(at::cuda::detail::canUse32BitIndexMath(input),
+ AT_CHECK(at::cuda::detail::canUse32BitIndexMath(input),
"input tensor must fit into 32-bit index math");
int numInputDims = input.dim();
int pleft, int pright,
int ptop, int pbottom,
int pfront, int pback) {
- AT_CHECK(at::cuda::detail::canUse32BitIndexMath(input),
+ AT_CHECK(at::cuda::detail::canUse32BitIndexMath(input),
"input tensor must fit into 32-bit index math");
int numInputDims = input.dim();
int ptop = paddingSize[2];
int pbottom = paddingSize[3];
int pfront = paddingSize[4];
- int pback = paddingSize[5];
+ int pback = paddingSize[5];
shapeCheck3d(input, pleft, pright, ptop,
pbottom, pfront, pback);
int outputW = inputW + pleft + pright;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
- input.type(), "replication_pad3d", [&] {
+ input.scalar_type(), "replication_pad3d_cuda", [&] {
if (numInputDims == 4) {
output.resize_({numPlanes, outputD, outputH, outputW});
int ptop = paddingSize[2];
int pbottom = paddingSize[3];
int pfront = paddingSize[4];
- int pback = paddingSize[5];
+ int pback = paddingSize[5];
shapeAndGradOutputCheck3d(input, gradOutput, pleft, pright, ptop,
pbottom, pfront, pback);
gradInput.zero_();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
- input.type(), "replication_pad3d_backward", [&] {
+ input.scalar_type(), "replication_pad3d_backward_cuda", [&] {
auto gradInput_ = gradInput;
auto gradOutput_ = gradOutput;
const int ILP = 2;
dim3 grid(outer_size);
dim3 block = SoftMax_getBlockSize(ILP, dim_size);
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "host_softmax", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "host_softmax", [&] {
using accscalar_t = acc_type<scalar_t, true>;
if (!half_to_float) {
cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, scalar_t, Epilogue>
} else {
uint32_t smem_size;
dim3 grid, block;
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "host_softmax", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "host_softmax", [&] {
using accscalar_t = acc_type<scalar_t, true>;
if (!half_to_float) {
SpatialSoftMax_getLaunchSizes<accscalar_t>(
const int ILP = 2;
dim3 grid(outer_size);
dim3 block = SoftMax_getBlockSize(ILP, dim_size);
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(gI.type(), "host_softmax_backward", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(gI.scalar_type(), "host_softmax_backward", [&] {
using accscalar_t = acc_type<scalar_t, true>;
if (!half_to_float) {
cunn_SoftMaxBackward<ILP, scalar_t, accscalar_t, scalar_t, Epilogue>
} else {
uint32_t smem_size;
dim3 grid, block;
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad.type(), "host_softmax_backward", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad.scalar_type(), "host_softmax_backward", [&] {
using accscalar_t = acc_type<scalar_t, true>;
if (!half_to_float) {
SpatialSoftMax_getLaunchSizes<accscalar_t>(
int64_t k,
int64_t dim,
bool keepdim) {
- AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, self.type(), "kthvalue", [&] {
+ AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, self.scalar_type(), "kthvalue_cuda", [&] {
kthvalue_cuda_template<scalar_t>(values, indices, self, k, dim, keepdim);
});
return std::forward_as_tuple(values, indices);
}
Tensor median_cuda(const Tensor& self) {
- return AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, self.type(), "median", [&] {
+ return AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, self.scalar_type(), "median", [&] {
return median_cuda_template<scalar_t>(self);
});
}
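Sites like median_cuda can return the dispatch directly because each AT_DISPATCH_* macro expands to an immediately invoked lambda, turning the whole switch into an expression whose value is the selected case's return value. A standalone model of that design:

#include <string>

std::string pick_path(int tag) {
  // Immediately invoked lambda: the switch becomes an expression.
  return [&]() -> std::string {
    switch (tag) {
      case 0: return "float path";
      case 1: return "double path";
      default: return "unsupported";
    }
  }();
}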
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA());
auto policy = thrust::cuda::par(allocator).on(stream);
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "_fft_fill_with_conjugate_symmetry_", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "_fft_fill_with_conjugate_symmetry_", [&] {
typedef thrust::device_ptr<scalar_t> device_ptr;
typedef thrust::counting_iterator<int64_t> counter;
typedef thrust::transform_iterator<cnt_to_dst_idx_functor, counter> dst_idx_iterator;
const Tensor& self,
const Tensor& weights,
int64_t minlength) {
- return AT_DISPATCH_INTEGRAL_TYPES(self.type(), "bincount", [&] {
+ return AT_DISPATCH_INTEGRAL_TYPES(self.scalar_type(), "bincount_cuda", [&] {
const auto scalar = weights.scalar_type();
if (scalar == ScalarType::Undefined || scalar == ScalarType::Float)
return _bincount_cuda_template<scalar_t, float>(self, weights, minlength);
if (self.scalar_type() == ScalarType::Half) {
AT_ERROR("HalfTensor is not supported");
}
- return AT_DISPATCH_ALL_TYPES(self.type(), "histc", [&] {
+ return AT_DISPATCH_ALL_TYPES(self.scalar_type(), "histc", [&] {
return _histc_cuda_template<scalar_t>(self, nbins, min.to<scalar_t>(), max.to<scalar_t>());
});
}
const Tensor& self,
const Tensor& other) {
Tensor ret = at::empty(self.sizes(), self.options());
- AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, ret.type(), "where", [&] {
+ AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, ret.scalar_type(), "where_cuda", [&] {
where_cuda<scalar_t>(ret, condition, self, other);
});
return ret;
} else {
// Generate random values for the keys array
AT_DISPATCH_ALL_TYPES(
- result.type(), "randperm_out_cuda", [&] {
+ result.scalar_type(), "randperm_out_cuda", [&] {
auto keys = at::empty(result.sizes(), result.options()).random_(generator);
auto result_data = thrust::device_ptr<scalar_t>(result.data<scalar_t>());
cuda::getApplyGrid(tril_size, dim_grid, tensor.get_device()),
"unable to get dim grid");
- AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, tensor.type(), "tril_indices_cuda", [&] {
+ AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, tensor.scalar_type(), "tril_indices_cuda", [&] {
tril_indices_kernel<<<
dim_grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>(
tensor.data<scalar_t>(),
cuda::getApplyGrid(triu_size, dim_grid, tensor.get_device()),
"unable to get dim grid");
- AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, tensor.type(), "triu_indices_cuda", [&] {
+ AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, tensor.scalar_type(), "triu_indices_cuda", [&] {
triu_indices_kernel<<<
dim_grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>(
tensor.data<scalar_t>(),
// use kernel_pointwise_flip_apply2 only when the dim to flip is the 1st or last dim, where collapseDims can reduce the amount of work
if (flip_dims_size == 1 && in_tensor.is_contiguous() && (flip_dims[0] == 0 || flip_dims[0] == total_dims - 1)) {
- AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, in_tensor.type(), "flip_cuda", [&] {
+ AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, in_tensor.scalar_type(), "flip_cuda", [&] {
auto in_tensor_info = cuda::detail::getTensorInfo<scalar_t, int64_t>(in_tensor);
auto out_tensor_info = cuda::detail::getTensorInfo<scalar_t, int64_t>(out_tensor);
int flip_dim = in_tensor_info.collapseDims(flip_dims[0]);
}
}
- AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, in_tensor.type(), "flip_cuda", [&] {
+ AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, in_tensor.scalar_type(), "flip_cuda", [&] {
flip_cuda_kernel<<<dim_grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>(
in_tensor.data<scalar_t>(), out_tensor.data<scalar_t>(), N, flip_dims_t.toType(CUDA(kLong)).data<int64_t>(), flip_dims_size,
strides_t.toType(CUDA(kLong)).data<int64_t>(), stride_contiguous.toType(CUDA(kLong)).data<int64_t>(), shape_t.toType(CUDA(kLong)).data<int64_t>(), total_dims);
auto total_dims = in_tensor.dim();
- AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, in_tensor.type(), "roll_cuda", [&] {
+ AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, in_tensor.scalar_type(), "roll_cuda", [&] {
roll_cuda_kernel<<<dim_grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>(
in_tensor.data<scalar_t>(), out_tensor.data<scalar_t>(), N,
dim, start,
std::tuple<Tensor, Tensor>
_unique_cuda(const Tensor& self, const bool sorted, const bool return_inverse) {
- return AT_DISPATCH_ALL_TYPES(self.type(), "unique", [&] {
+ return AT_DISPATCH_ALL_TYPES(self.scalar_type(), "unique_cuda", [&] {
    // The current CUDA implementation of unique always sorts due to the
    // lack of a hashtable implementation in thrust
return _unique_cuda_template<scalar_t>(self, return_inverse);
std::tuple<Tensor, Tensor>
_unique_dim_cuda(const Tensor& self, const int64_t dim, const bool sorted, const bool return_inverse) {
- return AT_DISPATCH_ALL_TYPES(self.type(), "unique_dim", [&] {
+ return AT_DISPATCH_ALL_TYPES(self.scalar_type(), "unique_dim", [&] {
return _unique_dim_cuda_template<scalar_t>(self, dim, return_inverse);
});
}
#include <THC/THCDeviceUtils.cuh>
#include <THC/THCTensorMathReduce.cuh>
-namespace at {
+namespace at {
namespace native {
namespace {
// Currently, kernels are non-persistent.
// Dialing up the block size to, say, 1024, can improve performance by
// increasing the amount of cache available per block, which can improve cache hit rate.
-// However, this is less efficient for short rows. 256 is pretty versatile.
+// However, this is less efficient for short rows. 256 is pretty versatile.
// May be worth implementing heuristics later.
#define BLOCK 256
// Block size for weight_norm_*_last_dim_kernel.
-// This is tricker than the first_dim case because we must make blocks
+// This is trickier than the first_dim case because we must make blocks
// at least 16 fast elements wide to ensure fully-coalesced half-precision accesses.
-// Since output-element parallelism is along the fast dimension, this reduces the number of
-// blocks we can launch by 16X.
+// Since output-element parallelism is along the fast dimension, this reduces the number of
+// blocks we can launch by 16X.
#define TILE_W 16
// Somewhat versatile strategy: max out intra-block parallelism by extending
// blocks across the slow dimension up to the hardware-max block size of 1024.
template<typename T, typename ReduceOp>
__device__ __forceinline__ void reduce_block_into_lanes
- (T *x,
- T val,
+ (T *x,
+ T val,
int lanes, // lanes is intended to be <= 32.
- ReduceOp reduceOp)
-{
+ ReduceOp reduceOp)
+{
int tid = threadIdx.x + threadIdx.y*blockDim.x;
int blockSize = blockDim.x*blockDim.y; // blockSize is intended to be a multiple of 32.
x[tid] = val;
__syncthreads();
}
-
+
#pragma unroll
- for(int i = (blockSize >> 1); i >= 64; i >>= 1)
+ for(int i = (blockSize >> 1); i >= 64; i >>= 1)
{
if(tid < i)
x[tid] = reduceOp(x[tid], x[tid+i]);
__syncthreads();
}
- if(tid < 32)
+ if(tid < 32)
{
T final;
if(blockSize >= 64)
for(int i = 16; i >= lanes; i >>= 1)
final = reduceOp(final, WARP_SHFL_DOWN(final, i));
- if(tid < lanes)
+ if(tid < lanes)
x[tid] = final; // EpilogueOp
}
}
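The tail of the reduction above leans on warp shuffles (through the WARP_SHFL_DOWN portability macro). The core pattern in plain CUDA, assuming a full 32-lane warp:

// Tree reduction within one warp: after five halving steps, lane 0 holds the sum.
__device__ float warp_sum(float val) {
  for (int offset = 16; offset > 0; offset >>= 1)
    val += __shfl_down_sync(0xffffffff, val, offset);
  return val;  // only lane 0's value is the full warp sum
}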
template
- <typename scalar_t,
+ <typename scalar_t,
typename accscalar_t>
__global__ void weight_norm_fwd_first_dim_kernel
(scalar_t* __restrict__ w,
accscalar_t* __restrict__ norms,
const scalar_t* __restrict__ v,
const scalar_t* __restrict__ g,
- const int rowSize)
+ const int rowSize)
{
// We are norming each slowest-dim row of the tensor separately.
// For now, assign one block to each row.
// extern __shared__ accscalar_t s[]; // error: declaration is incompatible with previous "s"
extern __shared__ char buf[];
accscalar_t* s = (accscalar_t*)buf;
-
+
accscalar_t thread_sum = 0.f;
- for(int i = tid; i < rowSize; i += stride )
+ for(int i = tid; i < rowSize; i += stride )
{
- accscalar_t val_f = scalar_cast<accscalar_t>(v[i+rowStart]);
+ accscalar_t val_f = scalar_cast<accscalar_t>(v[i+rowStart]);
thread_sum += val_f*val_f; // AccumOp, could do Kahan here
}
accscalar_t result = s[0];
result = sqrtf(result);
-
+
if(tid == 0)
norms[row] = result;
accscalar_t rnorm = 1.f/result; // for consistency with backward kernel
// Write data to output
- for(int i = tid; i < rowSize; i += stride )
+ for(int i = tid; i < rowSize; i += stride )
{
accscalar_t val_f = scalar_cast<accscalar_t>(v[i+rowStart]);
w[i+rowStart] = scalar_cast<scalar_t>(g_this_row*val_f*rnorm);
}
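As a plain-C++ reference for what this forward kernel computes per flattened row, w = g * v / ||v||_2, with the norm stashed in norms for backward. A sketch assuming contiguous rows:

#include <cmath>
#include <vector>

void weight_norm_rows_ref(const std::vector<float>& v, const std::vector<float>& g,
                          int rows, int rowSize,
                          std::vector<float>& w, std::vector<float>& norms) {
  for (int r = 0; r < rows; ++r) {
    double sum = 0.0;  // widened accumulator, mirroring accscalar_t
    for (int i = 0; i < rowSize; ++i) {
      const double x = v[r * rowSize + i];
      sum += x * x;
    }
    const float norm = std::sqrt(static_cast<float>(sum));
    norms[r] = norm;
    const float rnorm = 1.0f / norm;  // same reciprocal the kernel reuses
    for (int i = 0; i < rowSize; ++i)
      w[r * rowSize + i] = g[r] * v[r * rowSize + i] * rnorm;
  }
}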
template
- <typename scalar_t,
+ <typename scalar_t,
typename accscalar_t>
__global__ void weight_norm_fwd_last_dim_kernel
(
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
- accscalar_t val_f = scalar_cast<accscalar_t>(v[currentIdx]);
+ accscalar_t val_f = scalar_cast<accscalar_t>(v[currentIdx]);
thread_sum += val_f*val_f; // AccumOp, could do Kahan here
currentIdx += blockDim.y*fast_dim_size;
- slower_dims_location += blockDim.y;
+ slower_dims_location += blockDim.y;
}
- reduce_block_into_lanes(s, thread_sum, blockDim.x, ReduceAdd<accscalar_t>());
+ reduce_block_into_lanes(s, thread_sum, blockDim.x, ReduceAdd<accscalar_t>());
// Better to pass an EpilogueOp to reduce_block_into_lanes?
if(threadIdx.y == 0)
norms[fast_dim_location] = norm_this_col;
rnorms_this_block[threadIdx.x] = 1.f/norm_this_col;
}
-
- __syncthreads();
- accscalar_t g_this_col = scalar_cast<accscalar_t>(g[fast_dim_location]);
- accscalar_t rnorm = rnorms_this_block[threadIdx.x];
+ __syncthreads();
+
+ accscalar_t g_this_col = scalar_cast<accscalar_t>(g[fast_dim_location]);
+ accscalar_t rnorm = rnorms_this_block[threadIdx.x];
slower_dims_location = threadIdx.y;
currentIdx = fast_dim_location + fast_dim_size*slower_dims_location;
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
- accscalar_t val_f = scalar_cast<accscalar_t>(v[currentIdx]);
+ accscalar_t val_f = scalar_cast<accscalar_t>(v[currentIdx]);
w[currentIdx] = scalar_cast<scalar_t>(g_this_col*val_f*rnorm);
currentIdx += blockDim.y*fast_dim_size;
- slower_dims_location += blockDim.y;
- }
+ slower_dims_location += blockDim.y;
+ }
}
template
- <typename scalar_t,
+ <typename scalar_t,
typename accscalar_t>
__global__ void weight_norm_bwd_first_dim_kernel
(scalar_t* __restrict__ grad_v,
// extern __shared__ accscalar_t s[]; // error: declaration is incompatible with previous "s"
extern __shared__ char buf[];
accscalar_t* s = (accscalar_t*)buf;
-
+
accscalar_t thread_sum = 0.f;
- for(int i = tid; i < rowSize; i += stride )
+ for(int i = tid; i < rowSize; i += stride )
{
- accscalar_t grad_wi = scalar_cast<accscalar_t>(grad_w[i+rowStart]);
- accscalar_t saved_vi = scalar_cast<accscalar_t>(saved_v[i+rowStart]);
+ accscalar_t grad_wi = scalar_cast<accscalar_t>(grad_w[i+rowStart]);
+ accscalar_t saved_vi = scalar_cast<accscalar_t>(saved_v[i+rowStart]);
thread_sum += grad_wi*saved_vi; // AccumOp, could do Kahan here
}
// Could choose to save reciprocal of norm instead I suppose, but norms is probably
// more handy to keep around.
// Broadcast load; could use shared memory instead.
- accscalar_t rnorm = 1.f/saved_norms[row];
+ accscalar_t rnorm = 1.f/saved_norms[row];
accscalar_t rnorm3 = rnorm*rnorm*rnorm;
// Write g gradients.
// Broadcast load, could use shared memory instead.
accscalar_t g_this_row = scalar_cast<accscalar_t>(saved_g[row]);
-
- // Write v gradients. We are reusing values that were loaded earlier, so there
+
+ // Write v gradients. We are reusing values that were loaded earlier, so there
// is an optimization opportunity here (store values persistently).
- for(int j = tid; j < rowSize; j += stride )
+ for(int j = tid; j < rowSize; j += stride )
{
- accscalar_t grad_wj = scalar_cast<accscalar_t>(grad_w[j+rowStart]);
- accscalar_t saved_vj = scalar_cast<accscalar_t>(saved_v[j+rowStart]);
+ accscalar_t grad_wj = scalar_cast<accscalar_t>(grad_w[j+rowStart]);
+ accscalar_t saved_vj = scalar_cast<accscalar_t>(saved_v[j+rowStart]);
accscalar_t grad_vj = g_this_row*(rnorm*grad_wj - rnorm3*saved_vj*result);
grad_v[j+rowStart] = scalar_cast<scalar_t>(grad_vj);
}
}
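For reference, result here is the per-row dot product sum(grad_w * v) produced by the block reduction above (thread_sum accumulates grad_wi*saved_vi), and the grad_vj line implements the analytic gradient of w = g * v / ||v||:

    grad_v = g * (grad_w / ||v|| - v * (grad_w . v) / ||v||^3)
    grad_g = (grad_w . v) / ||v||

which is exactly rnorm*grad_wj - rnorm3*saved_vj*result, scaled by g_this_row.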
-template
- <typename scalar_t,
+template
+ <typename scalar_t,
typename accscalar_t>
__global__ void weight_norm_bwd_last_dim_kernel
(scalar_t* __restrict__ grad_v,
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
- accscalar_t grad_wi = scalar_cast<accscalar_t>(grad_w[currentIdx]);
- accscalar_t saved_vi = scalar_cast<accscalar_t>(saved_v[currentIdx]);
+ accscalar_t grad_wi = scalar_cast<accscalar_t>(grad_w[currentIdx]);
+ accscalar_t saved_vi = scalar_cast<accscalar_t>(saved_v[currentIdx]);
thread_sum += grad_wi*saved_vi; // AccumOp, could do Kahan here
currentIdx += blockDim.y*fast_dim_size;
- slower_dims_location += blockDim.y;
+ slower_dims_location += blockDim.y;
}
- reduce_block_into_lanes(s, thread_sum, blockDim.x, ReduceAdd<accscalar_t>());
+ reduce_block_into_lanes(s, thread_sum, blockDim.x, ReduceAdd<accscalar_t>());
accscalar_t result = s[threadIdx.x];
// Broadcast load; could use shared memory instead.
- accscalar_t rnorm = 1.f/saved_norms[fast_dim_location];
+ accscalar_t rnorm = 1.f/saved_norms[fast_dim_location];
accscalar_t rnorm3 = rnorm*rnorm*rnorm;
// Write g gradients.
if(fast_dim_location < fast_dim_size)
while(slower_dims_location < slower_dims_size)
{
- accscalar_t grad_wj = scalar_cast<accscalar_t>(grad_w[currentIdx]);
- accscalar_t saved_vj = scalar_cast<accscalar_t>(saved_v[currentIdx]);
+ accscalar_t grad_wj = scalar_cast<accscalar_t>(grad_w[currentIdx]);
+ accscalar_t saved_vj = scalar_cast<accscalar_t>(saved_v[currentIdx]);
accscalar_t grad_vj = g_this_col*(rnorm*grad_wj - rnorm3*saved_vj*result);
grad_v[currentIdx] = scalar_cast<scalar_t>(grad_vj);
currentIdx += blockDim.y*fast_dim_size;
- slower_dims_location += blockDim.y;
- }
+ slower_dims_location += blockDim.y;
+ }
}
} // anonymous namespace
std::tuple<Tensor,Tensor> weight_norm_cuda
(const Tensor & v,
const Tensor & g,
- int64_t dim)
+ int64_t dim)
{
auto w = at::empty_like(v);
// sends the unpacked g.data() as the argument. In other words, we expect "g" to be a bare Tensor here.
// norms is only needed to stash for backward.
- // g.scalar_type() may be at::ScalarType::Double, Float, or Half.
+ // g.scalar_type() may be at::ScalarType::Double, Float, or Half.
// If Half, stash norms as float.
at::ScalarType AccType = g.scalar_type() == at::ScalarType::Half ?
at::ScalarType::Float : g.scalar_type();
- // Will this create norms on the same device as g, regardless of what the thread's default
+ // Will this create norms on the same device as g, regardless of what the thread's default
// current device is? I believe so, because Type::* functions are DeviceGuard()ed.
auto norms = at::empty_strided(g.sizes(), g.strides(), g.options().dtype(AccType));
const int ndims = v.dim();
- if(dim == 0)
+ if(dim == 0)
{
// Find logical size of each flattened slowest-dim row
int rowSize = 1;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF
- (v.type(),
- "weight_norm_fwd_first_dim_kernel",
+ (v.scalar_type(),
+ "weight_norm_fwd_first_dim_kernel",
[&]
{
using accscalar_t = acc_type<scalar_t, true>;
weight_norm_fwd_first_dim_kernel<scalar_t, accscalar_t>
- <<<v.size(0),
- BLOCK,
+ <<<v.size(0),
+ BLOCK,
BLOCK*sizeof(accscalar_t),
stream>>>
- (w.data<scalar_t>(),
+ (w.data<scalar_t>(),
norms.data<accscalar_t>(),
- v.data<scalar_t>(),
- g.data<scalar_t>(),
+ v.data<scalar_t>(),
+ g.data<scalar_t>(),
rowSize);
});
}
slower_dims_size *= v.size(i);
int fast_dim_size = v.size(ndims-1);
-
+
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF
- (v.type(),
- "weight_norm_fwd_last_dim_kernel",
+ (v.scalar_type(),
+ "weight_norm_fwd_last_dim_kernel",
[&]
{
using accscalar_t = acc_type<scalar_t, true>;
-
+
weight_norm_fwd_last_dim_kernel<scalar_t, accscalar_t>
<<<(fast_dim_size+TILE_W-1)/TILE_W,
dim3(TILE_W,TILE_H),
// The kernel execution is asynchronous, so this will only catch errors on the kernel launch,
// not the kernel's execution. Errors in kernel execution aren't guaranteed to be caught
- // until a later error check on a synchronizing CUDA call. Unfortunately, without manually
+ // until a later error check on a synchronizing CUDA call. Unfortunately, without manually
// synchronizing here, this is the best we can do.
THCudaCheck(cudaGetLastError());
}
std::tuple<Tensor, Tensor> weight_norm_cuda_backward
- (const Tensor & grad_w,
- const Tensor & saved_v,
- const Tensor & saved_g,
+ (const Tensor & grad_w,
+ const Tensor & saved_v,
+ const Tensor & saved_g,
const Tensor & saved_norms,
int64_t dim)
{
const int ndims = saved_v.dim();
- if(dim == 0)
+ if(dim == 0)
{
// Find logical size of each flattened slowest-dim row
int rowSize = 1;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF
- (saved_v.type(),
- "weight_norm_bwd_first_dim_kernel",
+ (saved_v.scalar_type(),
+ "weight_norm_bwd_first_dim_kernel",
[&]
{
using accscalar_t = acc_type<scalar_t, true>;
weight_norm_bwd_first_dim_kernel<scalar_t, accscalar_t>
- <<<grad_w.size(0),
- BLOCK,
+ <<<grad_w.size(0),
+ BLOCK,
BLOCK*sizeof(accscalar_t),
stream>>>
(grad_v.data<scalar_t>(),
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF
- (saved_v.type(),
- "weight_norm_bwd_last_dim_kernel",
+ (saved_v.scalar_type(),
+ "weight_norm_bwd_last_dim_kernel",
[&]
{
using accscalar_t = acc_type<scalar_t, true>;
weight_norm_bwd_last_dim_kernel<scalar_t, accscalar_t>
<<<(fast_dim_size+TILE_W-1)/TILE_W,
- dim3(TILE_W,TILE_H),
+ dim3(TILE_W,TILE_H),
(TILE_W*TILE_H + TILE_W)*sizeof(accscalar_t),
stream>>>
(grad_v.data<scalar_t>(),
// The kernel execution is asynchronous, so this will only catch errors on the kernel launch,
// not the kernel's execution. Errors in kernel execution aren't guaranteed to be caught
- // until a later error check on a synchronizing CUDA call. Unfortunately, without manually
+ // until a later error check on a synchronizing CUDA call. Unfortunately, without manually
// synchronizing here, this is the best we can do.
THCudaCheck(cudaGetLastError());
Tensor& _baddbmm_mkl_(Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) {
// checks are done in native/LinearAlgebra.cpp
- AT_DISPATCH_FLOATING_TYPES(self.type(), "baddbmm__mkl", [&] {
+ AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "baddbmm__mkl", [&] {
baddbmm_mkl_template<scalar_t>(self, batch1, batch2, beta, alpha);
});
{
int tid = omp_get_thread_num();
int64_t start = tid * num_slices_per_thread;
- AT_DISPATCH_FLOATING_TYPES(input.type(), "_fft_fill_with_conjugate_symmetry", [&] {
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "_fft_fill_with_conjugate_symmetry", [&] {
_fft_fill_with_conjugate_symmetry_slice<scalar_t>(input, signal_ndim, size_last_dim,
last_dim_start_slice, start, std::min(num_slices_per_thread, num - start));
});
return;
}
#endif
- AT_DISPATCH_FLOATING_TYPES(input.type(), "_fft_fill_with_conjugate_symmetry", [&] {
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "_fft_fill_with_conjugate_symmetry", [&] {
_fft_fill_with_conjugate_symmetry_slice<scalar_t>(input, signal_ndim, size_last_dim,
last_dim_start_slice, 0, num);
});
}} // namespace at::native
#endif
-
// TODO: it seems like sparse_dim == 0 could be supported even if self.dim() > 0,
// but this would take some work and doesn't seem particularly useful.
AT_CHECK(sparse_dim > 0 || self.dim() == 0, "sparse_dim must be >0 if dimensionality > 0");
- AT_CHECK(sparse_dim <= dims,
+ AT_CHECK(sparse_dim <= dims,
"sparse_dim must be less than or equal to self.dim()");
at::TensorOptions sparse_options = self.options().layout(kSparse);
std::vector<int64_t> sizes = self.sizes().vec();
int64_t i = -1;
AT_DISPATCH_ALL_TYPES(
- values.type(), "coalesce", [&] {
+ values.scalar_type(), "coalesce", [&] {
int64_t prev = -1;
int64_t blockSize = values.stride(0);
scalar_t* values_ptr = values.data<scalar_t>();
// TODO: Re-audit this; it used to be an indexSelect directly into r_values
at::index_select_out(r_values, t_view, 0, indices);
} else {
- AT_DISPATCH_ALL_TYPES(r_values.type(), "sparse_mask", [&] {
+ AT_DISPATCH_ALL_TYPES(r_values.scalar_type(), "sparse_mask", [&] {
sparse_mask_out_cpu_kernel<scalar_t>(
r_values,
t,
auto src_indices_accessor = src_indices.accessor<int64_t, 2>();
AT_DISPATCH_ALL_TYPES(
- t_values.type(), "cadd_sparse", [&] {
+ t_values.scalar_type(), "cadd_sparse", [&] {
scalar_t* t_values_ptr = t_values.data<scalar_t>();
scalar_t* s_values_ptr = s_values.data<scalar_t>();
scalar_t* r_values_ptr = r_values.data<scalar_t>();
}
} else {
AT_DISPATCH_ALL_TYPES(
- values.type(), "add_dense_sparse", [&] {
+ values.scalar_type(), "add_dense_sparse", [&] {
add_dense_sparse_worker_cpu<scalar_t>(r, value, sparse, indices, values);
});
}
}
} else {
AT_DISPATCH_ALL_TYPES(
- r_values.type(), "mul_out_sparse", [&] {
+ r_values.scalar_type(), "mul_out_sparse", [&] {
auto r_accessor = r_values.accessor<scalar_t, 1>();
auto t_accessor = t_values.accessor<scalar_t, 1>();
auto s_accessor = s_values.accessor<scalar_t, 1>();
Tensor values = sparse_._values();
AT_DISPATCH_ALL_TYPES(
- values.type(), "addmm_sparse_dense", [&] {
+ values.scalar_type(), "addmm_sparse_dense", [&] {
s_addmm_out_sparse_dense_worker<scalar_t>(nnz, dim_i, dim_j, dim_k, r, beta, t, alpha, indices, values, dense);
}
);
int64_t newv_stride0 = newv.stride(0);
AT_DISPATCH_ALL_TYPES(
- values.type(), "sspmm", [&] {
+ values.scalar_type(), "sspmm", [&] {
auto values_accessor = values.accessor<scalar_t, 1>();
scalar_t* dense_ptr = dense.data<scalar_t>();
scalar_t* newv_ptr = newv.data<scalar_t>();
dim3 grid(THCCeilDiv(newNnz, (int64_t) 4), THCCeilDiv(stride, (int64_t) 128));
dim3 block(32, 4);
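// Launch-shape arithmetic, for reference: THCCeilDiv(n, d) == (n + d - 1) / d.
// E.g. newNnz == 10 gives grid.x == THCCeilDiv(10, 4) == 3, and stride == 200
// gives grid.y == THCCeilDiv(200, 128) == 2, with 32*4 == 128 threads per
// block. (The example values are illustrative only.)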
AT_DISPATCH_ALL_TYPES_AND(
- at::ScalarType::Half,values.type(), "coalesce_sparse_cuda", [&] {
+ at::ScalarType::Half, values.scalar_type(), "coalesce_sparse_cuda", [&] {
using cuda_accscalar_t = acc_type<scalar_t, /* is_cuda */ true>;
apply::coalesceValuesKernel<scalar_t, cuda_accscalar_t><<<grid, block, 0, stream>>>(
uniqueOffsets.data<int64_t>(),
// No half support, so we don't have to use CUDATypeConversion
Tensor r__;
AT_DISPATCH_FLOATING_TYPES(
- values.type(), "addmm_sparse_cuda", [&] {
+ values.scalar_type(), "addmm_sparse_cuda", [&] {
scalar_t cast_beta = beta.to<scalar_t>();
scalar_t cast_alpha = alpha.to<scalar_t>();
if (cast_beta == 0) {
AT_CHECK(cuda::getApplyGrid(nnz, grid, curDevice), "add: Argument #0: tensor too large or too many dimensions");
AT_DISPATCH_ALL_TYPES_AND(
- at::ScalarType::Half, values.type(), "add_out_dense_sparse_cuda", [&] {
+ at::ScalarType::Half, values.scalar_type(), "add_out_dense_sparse_cuda", [&] {
apply::sparseElementwiseKernelScalar<TensorCAddOp<scalar_t>, uint64_t, scalar_t>
<<<grid, block, 0, stream>>>(
TensorCAddOp<scalar_t>(value.to<scalar_t>()),
values = values.contiguous();
AT_DISPATCH_ALL_TYPES_AND(
- at::ScalarType::Half, values.type(), "add_out_dense_sparse_cuda", [&] {
+ at::ScalarType::Half, values.scalar_type(), "add_out_dense_sparse_cuda", [&] {
apply::sparseElementwiseKernel<TensorCAddOp<scalar_t>, uint64_t, scalar_t>
<<<grid, block, 0, stream>>>(
TensorCAddOp<scalar_t>(value.to<scalar_t>()),
// FIXME: at some point we can wrap the scale into indexAdd
// NB: Purposely not inplace!
AT_DISPATCH_ALL_TYPES_AND(
- at::ScalarType::Half, values.type(), "add_out_dense_sparse_cuda", [&] {
+ at::ScalarType::Half, values.scalar_type(), "add_out_dense_sparse_cuda", [&] {
if (value.to<scalar_t>() != static_cast<scalar_t>(1)) {
values = values.mul(value);
}
Tensor s_values_ = src._values();
AT_DISPATCH_ALL_TYPES_AND(
- at::ScalarType::Half, s_values_.type(), "add_out_sparse_cuda", [&] {
+ at::ScalarType::Half, s_values_.scalar_type(), "add_out_sparse_cuda", [&] {
if (value.to<scalar_t>() != static_cast<scalar_t>(1)) {
s_values_ = s_values_.mul(value);
}
LongTensor resultNnz = at::empty({1}, CUDA(kLong));
AT_DISPATCH_ALL_TYPES_AND(
- at::ScalarType::Half, t_values_.type(), "mul_out_sparse_cuda", [&] {
+ at::ScalarType::Half, t_values_.scalar_type(), "mul_out_sparse_cuda", [&] {
apply::valueSparseIntersectionKernel<TensorMulOp<scalar_t>, uint64_t, scalar_t>
<<<grid, block, 0, stream>>>(
TensorMulOp<scalar_t>(),
auto input_indices_ti = getTensorInfo<int64_t, int64_t>(input_indices_1D);
auto input_indices_pos_ti = getTensorInfo<int64_t, int64_t>(input_indices_pos);
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad_values.type(), "_sparse_sum_backward_cuda", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad_values.scalar_type(), "_sparse_sum_backward_cuda", [&] {
auto grad_values_expand_ti = getTensorInfo<scalar_t, int64_t>(grad_values_expand);
auto grad_input_values_ti = getTensorInfo<scalar_t, int64_t>(grad_input_values);
auto zero_dim = at::empty({}, type);
zero_dim.fill_(2);
zero_dim.exp_();
- AT_DISPATCH_FLOATING_TYPES(zero_dim.type(), "test0", [&] {
+ AT_DISPATCH_FLOATING_TYPES(zero_dim.scalar_type(), "test0", [&] {
ASSERT(zero_dim.data<scalar_t>()[0] == std::exp(2));
});
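// (For reference: fill_(2) followed by exp_() leaves the single element at
// e^2 ~= 7.3890561, which is what the assert compares against std::exp(2).)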
}
}
- AT_DISPATCH_FLOATING_TYPES(a0.type(), "test1", [&] {
+ AT_DISPATCH_FLOATING_TYPES(a0.scalar_type(), "test1", [&] {
CPU_tensor_apply2<scalar_t, scalar_t>(
a0, a1, [](scalar_t& y, const scalar_t& x) { y = x * x; });
CPU_tensor_apply2<double, scalar_t>(
}
});
- AT_DISPATCH_FLOATING_TYPES(a0.type(), "test2", [&] {
+ AT_DISPATCH_FLOATING_TYPES(a0.scalar_type(), "test2", [&] {
CPU_tensor_apply3<scalar_t, scalar_t, scalar_t>(
a0, a1, a2, [](scalar_t& y, const scalar_t& x, const scalar_t& z) {
y = x * x + z;
}
});
- AT_DISPATCH_FLOATING_TYPES(a0.type(), "test3", [&] {
+ AT_DISPATCH_FLOATING_TYPES(a0.scalar_type(), "test3", [&] {
CPU_tensor_apply4<scalar_t, scalar_t, scalar_t, scalar_t>(
a0,
a1,
ASSERT_EQ(scalar_to_tensor(ones({}).item()).scalar_type(), kDouble);
if (x.scalar_type() != ScalarType::Half) {
- AT_DISPATCH_ALL_TYPES(x.type(), "foo", [&] {
+ AT_DISPATCH_ALL_TYPES(x.scalar_type(), "foo", [&] {
scalar_t s = 1;
std::stringstream ss;
ASSERT_NO_THROW(
torch::Tensor half_test(torch::Tensor input) {
auto output = torch::empty(1, input.options().dtype(torch::kFloat));
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "half_test", [&] {
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "half_test", [&] {
half_test_kernel<scalar_t><<<1, 1>>>(
input.data<scalar_t>(),
output.data<float>());
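// Usage sketch (illustrative): the dispatch covers float, double, and half
// inputs, while the output buffer is always kFloat:
//
//   auto y = half_test(torch::ones({1},
//       torch::dtype(torch::kHalf).device(torch::kCUDA)));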
static PyObject* THPFInfo_eps(THPFInfo* self, void*) {
return AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
- at::CPU(self->type), "epsilon", [] {
+ self->type, "epsilon", [] {
return PyFloat_FromDouble(
std::numeric_limits<
at::scalar_value_type<scalar_t>::type>::epsilon());
}
static PyObject* THPFInfo_max(THPFInfo* self, void*) {
- return AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(at::CPU(self->type), "max", [] {
+ return AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self->type, "max", [] {
return PyFloat_FromDouble(
std::numeric_limits<at::scalar_value_type<scalar_t>::type>::max());
});
}
static PyObject* THPFInfo_min(THPFInfo* self, void*) {
- return AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(at::CPU(self->type), "min", [] {
+ return AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self->type, "min", [] {
return PyFloat_FromDouble(
std::numeric_limits<at::scalar_value_type<scalar_t>::type>::lowest());
});
}
static PyObject* THPIInfo_max(THPIInfo* self, void*) {
- return AT_DISPATCH_INTEGRAL_TYPES(at::CPU(self->type), "max", [] {
+ return AT_DISPATCH_INTEGRAL_TYPES(self->type, "max", [] {
return THPUtils_packInt64(std::numeric_limits<scalar_t>::max());
});
}
static PyObject* THPIInfo_min(THPIInfo* self, void*) {
- return AT_DISPATCH_INTEGRAL_TYPES(at::CPU(self->type), "min", [] {
+ return AT_DISPATCH_INTEGRAL_TYPES(self->type, "min", [] {
return THPUtils_packInt64(std::numeric_limits<scalar_t>::lowest());
});
}
static PyObject* THPFInfo_tiny(THPFInfo* self, void*) {
- return AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(at::CPU(self->type), "min", [] {
+ return AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self->type, "min", [] {
return PyFloat_FromDouble(
std::numeric_limits<at::scalar_value_type<scalar_t>::type>::min());
});
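// For reference, the finfo getters above map onto std::numeric_limits as
// follows (float32 values shown):
//
//   finfo.eps  -> epsilon() == 1.1920929e-07 (2^-23)
//   finfo.max  -> max()     ==  3.4028235e+38
//   finfo.min  -> lowest()  == -3.4028235e+38
//   finfo.tiny -> min()     ==  1.1754944e-38 (smallest positive normal)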