ins[1],
outs[0],
conv_dtype), name="y")
+
+def softmax(x, axis=-1):
+ """Compute softmax using CuDNN
+
+ Parameters
+ ----------
+ x : tvm.te.Tensor
+ The input tensor
+
+ axis : int
+ The axis along which softmax is computed
+
+ Returns
+ -------
+ ret : tvm.te.Tensor
+ The result tensor
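+
+ Examples
+ --------
+ A minimal usage sketch (assumes a cuDNN-enabled CUDA build)::
+
+ x = te.placeholder((32, 10), name="x")
+ y = cudnn.softmax(x, axis=-1)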
+ """
+ return te.extern(
+ x.shape, [x],
+ lambda ins, outs: tvm.tir.call_packed(
+ "tvm.contrib.cudnn.softmax.forward",
+ ins[0],
+ outs[0],
+ axis), name="y")
# softmax
-reg.register_schedule("nn.softmax", strategy.schedule_softmax)
+reg.register_strategy("nn.softmax", strategy.softmax_strategy)
reg.register_pattern("nn.softmax", OpPattern.OPAQUE)
# log_softmax
-reg.register_schedule("nn.log_softmax", strategy.schedule_softmax)
+reg.register_schedule("nn.log_softmax", strategy.schedule_log_softmax)
reg.register_pattern("nn.log_softmax", OpPattern.OPAQUE)
"""Attributes for nn.dense"""
+@tvm._ffi.register_object("relay.attrs.SoftmaxAttrs")
+class SoftmaxAttrs(Attrs):
+ """Attributes for nn.softmax"""
+
+
@tvm._ffi.register_object("relay.attrs.FIFOBufferAttrs")
class FIFOBufferAttrs(Attrs):
"""Attributes for nn.fifo_buffer"""
with target:
return topi.cuda.schedule_adaptive_pool(outs)
-@schedule_softmax.register(["cuda", "gpu"])
-def schedule_softmax_cuda(attrs, outs, target):
- """schedule softmax for cuda"""
+@softmax_strategy.register(["cuda", "gpu"])
+def softmax_strategy_cuda(attrs, inputs, out_type, target):
+ """softmax cuda strategy"""
+ strategy = _op.OpStrategy()
+ strategy.add_implementation(
+ wrap_compute_softmax(topi.nn.softmax),
+ wrap_topi_schedule(topi.cuda.schedule_softmax),
+ name="softmax.cuda")
+ if target.target_name == "cuda" and "cudnn" in target.libs:
+ strategy.add_implementation(
+ wrap_compute_softmax(topi.cuda.softmax_cudnn),
+ wrap_topi_schedule(topi.cuda.schedule_softmax_cudnn),
+ name="softmax.cudnn",
+ plevel=15)
+ return strategy
+
+@schedule_log_softmax.register(["cuda", "gpu"])
+def schedule_log_softmax_cuda(attrs, outs, target):
+ """scheudle log_softmax for cuda"""
with target:
return topi.cuda.schedule_softmax(outs)
return topi.generic.schedule_adaptive_pool(outs)
# softmax
+def wrap_compute_softmax(topi_compute):
+ """Wrap softmax topi compute"""
+ def _compute_softmax(attrs, inputs, out_type):
+ axis = attrs.get_int("axis")
+ return [topi_compute(inputs[0], axis)]
+ return _compute_softmax
+
+@override_native_generic_func("softmax_strategy")
+def softmax_strategy(attrs, inputs, out_type, target):
+ """softmax generic strategy"""
+ strategy = _op.OpStrategy()
+ strategy.add_implementation(
+ wrap_compute_softmax(topi.nn.softmax),
+ wrap_topi_schedule(topi.generic.schedule_softmax),
+ name="softmax.generic")
+ return strategy
+
+# log_softmax
@generic_func
-def schedule_softmax(attrs, outs, target):
- """Schedule softmax"""
+def schedule_log_softmax(attrs, outs, target):
+ """Schedule log_softmax op"""
with target:
return topi.generic.schedule_softmax(outs)
with target:
return topi.hls.schedule_adaptive_pool(outs)
-@schedule_softmax.register("hls")
-def schedule_softmax_hls(attrs, outs, target):
- """schedule softmax for hls"""
+@softmax_strategy.register("hls")
+def softmax_strategy_hls(attrs, inputs, out_type, target):
+ """softmax hls strategy"""
+ strategy = _op.OpStrategy()
+ strategy.add_implementation(
+ wrap_compute_softmax(topi.nn.softmax),
+ wrap_topi_schedule(topi.hls.schedule_softmax),
+ name="softmax.hls")
+ return strategy
+
+@schedule_log_softmax.register("hls")
+def schedule_log_softmax_hls(attrs, outs, target):
+ """schedule log_softmax for hls"""
with target:
return topi.hls.schedule_softmax(outs)
with target:
return topi.opengl.schedule_adaptive_pool(outs)
-@schedule_softmax.register("opengl")
-def schedule_softmax_opengl(attrs, outs, target):
- """schedule softmax for opengl"""
+@softmax_strategy.register("opengl")
+def softmax_strategy_opengl(attrs, inputs, out_type, target):
+ """softmax opengl strategy"""
+ strategy = _op.OpStrategy()
+ strategy.add_implementation(
+ wrap_compute_softmax(topi.nn.softmax),
+ wrap_topi_schedule(topi.opengl.schedule_softmax),
+ name="softmax.opengl")
+ return strategy
+
+@schedule_log_softmax.register("opengl")
+def schedule_log_softmax_opengl(attrs, outs, target):
+ """schedule log_softmax for opengl"""
with target:
return topi.opengl.schedule_softmax(outs)
with target:
return topi.x86.schedule_adaptive_pool(outs)
-@schedule_softmax.register("cpu")
-def schedule_softmax_cpu(attrs, outs, target):
- """schedule softmax for x86"""
+@softmax_strategy.register("cpu")
+def softmax_strategy_cpu(attrs, inputs, out_type, target):
+ """softmax x86 strategy"""
+ strategy = _op.OpStrategy()
+ strategy.add_implementation(
+ wrap_compute_softmax(topi.nn.softmax),
+ wrap_topi_schedule(topi.x86.schedule_softmax),
+ name="softmax.x86")
+ return strategy
+
+@schedule_log_softmax.register("cpu")
+def schedule_log_softmax_cpu(attrs, outs, target):
+ """schedule log_softmax op for x86"""
with target:
return topi.x86.schedule_softmax(outs)
.set_num_inputs(1)
.add_argument("data", "Tensor", "The input tensor.")
.set_support_level(1)
-.add_type_rel("Identity", IdentityRel)
-.set_attr<FTVMCompute>("FTVMCompute", [](const Attrs& attrs,
- const Array<te::Tensor>& inputs,
- const Type& out_type) {
- const auto* param = attrs.as<SoftmaxAttrs>();
- CHECK(param != nullptr);
- return Array<te::Tensor>{ topi::nn::softmax(inputs[0], param->axis) };
-});
+.add_type_rel("Identity", IdentityRel);
// relay.nn.log_softmax
workspace_size = 0;
}
+// SoftmaxEntry
+
+SoftmaxEntry::SoftmaxEntry() {
+ CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc));
+}
+
+SoftmaxEntry::~SoftmaxEntry() {
+ CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc));
+}
+
} // namespace contrib
} // namespace tvm
void CleanWorkspace();
}; // ConvThreadEntry
+struct SoftmaxEntry {
+ cudnnSoftmaxMode_t mode;
+ cudnnDataType_t data_type;
+ cudnnTensorDescriptor_t shape_desc;
+ SoftmaxEntry();
+ ~SoftmaxEntry();
+}; // SoftmaxEntry
struct CuDNNThreadEntry {
CuDNNThreadEntry();
~CuDNNThreadEntry();
cudnnHandle_t handle{nullptr};
ConvEntry conv_entry;
+ SoftmaxEntry softmax_entry;
runtime::DeviceAPI *cuda_api{nullptr};
static CuDNNThreadEntry* ThreadLocal();
}; // CuDNNThreadEntry
--- /dev/null
+++ b/src/runtime/contrib/cudnn/softmax.cc
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/runtime/contrib/cudnn/softmax.cc
+ * \brief Use external cudnn softmax function
+ */
+#include <tvm/runtime/registry.h>
+#include <tvm/runtime/device_api.h>
+#include "cudnn_utils.h"
+
+namespace tvm {
+namespace contrib {
+
+using namespace runtime;
+
+TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.softmax.forward")
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+ DLTensor* x = args[0];
+ DLTensor* y = args[1];
+ int axis = args[2];
+ int ndim = x->ndim;
+ int64_t* shape = x->shape;
+ if (axis < 0) axis += ndim;
+ CHECK(axis >= 0 && axis < ndim);
+
+ CuDNNThreadEntry* entry_ptr = CuDNNThreadEntry::ThreadLocal();
+ entry_ptr->softmax_entry.data_type = CuDNNDataType::DLTypeToCuDNNType(x->dtype);
+
+ // Set mode and shape descriptor
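+ // cuDNN softmax operates on 4-D NCHW tensors. For the innermost axis,
+ // flatten the leading dims into N and reduce over C with MODE_INSTANCE;
+ // otherwise fold the dims before/after the axis into N and H and reduce
+ // over C with MODE_CHANNEL.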
+ if (axis == ndim - 1) {
+ int64_t N = 1;
+ for (int i = 0; i < ndim - 1; ++i) {
+ N *= shape[i];
+ }
+ entry_ptr->softmax_entry.mode = CUDNN_SOFTMAX_MODE_INSTANCE;
+ CUDNN_CALL(cudnnSetTensor4dDescriptor(entry_ptr->softmax_entry.shape_desc,
+ CUDNN_TENSOR_NCHW,
+ entry_ptr->softmax_entry.data_type,
+ static_cast<int>(N),
+ static_cast<int>(shape[ndim - 1]),
+ 1,
+ 1));
+ } else {
+ int64_t pre_axis_dim = 1;
+ int64_t post_axis_dim = 1;
+ for (int i = 0; i < ndim; ++i) {
+ if (i < axis) {
+ pre_axis_dim *= shape[i];
+ } else if (i > axis) {
+ post_axis_dim *= shape[i];
+ }
+ }
+ entry_ptr->softmax_entry.mode = CUDNN_SOFTMAX_MODE_CHANNEL;
+ CUDNN_CALL(cudnnSetTensor4dDescriptor(entry_ptr->softmax_entry.shape_desc,
+ CUDNN_TENSOR_NCHW,
+ entry_ptr->softmax_entry.data_type,
+ static_cast<int>(pre_axis_dim),
+ static_cast<int>(shape[axis]),
+ static_cast<int>(post_axis_dim),
+ 1));
+ }
+
+ auto alpha = CuDNNDataType::GetConst<1>(entry_ptr->softmax_entry.data_type);
+ auto beta = CuDNNDataType::GetConst<0>(entry_ptr->softmax_entry.data_type);
+ CUDNN_CALL(cudnnSoftmaxForward(entry_ptr->handle,
+ CUDNN_SOFTMAX_ACCURATE,
+ entry_ptr->softmax_entry.mode,
+ alpha,
+ entry_ptr->softmax_entry.shape_desc,
+ x->data,
+ beta,
+ entry_ptr->softmax_entry.shape_desc,
+ y->data));
+});
+
+} // namespace contrib
+} // namespace tvm
def test_conv3d():
verify_conv3d("float32", "float32", tensor_format=0)
+
+def verify_softmax(shape, axis, dtype="float32"):
+ A = te.placeholder(shape, dtype=dtype, name='A')
+ B = cudnn.softmax(A, axis)
+ s = te.create_schedule([B.op])
+
+ ctx = tvm.gpu(0)
+ a_np = np.random.uniform(size=shape).astype(dtype)
+ b_np = topi.testing.softmax_python(a_np)
+ a = tvm.nd.array(a_np, ctx)
+ b = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx)
+ f = tvm.build(s, [A, B], "cuda", target_host="llvm", name="softmax")
+ f(a, b)
+ tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3)
+
+def verify_softmax_4d(shape, dtype="float32"):
+ A = te.placeholder(shape, dtype=dtype, name='A')
+ B = cudnn.softmax(A, axis=1)
+ s = te.create_schedule([B.op])
+
+ ctx = tvm.gpu(0)
+ n, c, h, w = shape
+ a_np = np.random.uniform(size=shape).astype(dtype)
+ b_np = topi.testing.softmax_python(a_np.transpose(0, 2, 3, 1).reshape(n*h*w, c))
+ b_np = b_np.reshape(n, h, w, c).transpose(0, 3, 1, 2)
+ a = tvm.nd.array(a_np, ctx)
+ b = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx)
+ f = tvm.build(s, [A, B], "cuda", target_host="llvm", name="softmax")
+ f(a, b)
+ tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3)
+
+def test_softmax():
+ if not tvm.runtime.enabled("cuda"):
+ print("skip because cuda is not enabled...")
+ return
+ if not tvm.get_global_func("tvm.contrib.cudnn.conv.output_shape", True):
+ print("skip because cudnn is not enabled...")
+ return
+
+ verify_softmax((32, 10), -1)
+ verify_softmax((3, 4), -1)
+ verify_softmax((1, 5), -1, "float64")
+ verify_softmax_4d((1, 16, 256, 256))
+ verify_softmax_4d((1, 16, 256, 256), "float64")
+
if __name__ == "__main__":
test_conv2d()
test_conv3d()
+ test_softmax()
from .conv3d_winograd import *
from . import conv3d_alter_op
from .reduction import schedule_reduce
-from .softmax import schedule_softmax
+from .softmax import *
from .injective import schedule_injective, schedule_elemwise, schedule_broadcast
from .dense import *
from .pooling import *
# pylint: disable=invalid-name, unused-variable, trailing-whitespace
"""Schedule for softmax operator"""
from tvm import te
+from tvm.contrib import cudnn
+from .. import generic
from .injective import schedule_injective_from_existing
s[softmax].bind(tx, thread_x)
return s
+
+
+def softmax_cudnn(x, axis=-1):
+ """Perform softmax on the data using cudnn"""
+ return cudnn.softmax(x, axis)
+
+
+def schedule_softmax_cudnn(outs):
+ """Schedule for softmax cudnn op"""
+ return generic.schedule_extern(outs)
return te.compute(shape, lambda *indices: _normalize(exp, expsum, *indices),
name='T_softmax_norm', attrs={"axis" : axis})
-
@tvm.te.tag_scope(tag='log_softmax_output')
def log_softmax(x):
"""Perform log softmax activation on the data