import tvm
from tvm import autotvm
from tvm.autotvm.task.space import SplitEntity
+from tvm.contrib import cblas
from .util import get_fp32_len
from .. import generic, tag, nn
from ..util import traverse_inline, get_const_tuple

# Declare dense compute with packing weight into cache-friendly layout
@autotvm.register_topi_compute(nn.dense, "cpu", "direct_pack")
def _declaration_dense_pack(cfg, data, weight, bias=None, out_dtype=None):
-    if out_dtype is None:
-        out_dtype = data.dtype
-    batch, in_dim = get_const_tuple(data.shape)
-    out_dim, _ = get_const_tuple(weight.shape)
-    # create tuning space
-    cfg.define_split("tile_y", batch, num_outputs=3)
-    cfg.define_split("tile_x", out_dim, num_outputs=3)
-    cfg.define_split("tile_k", in_dim, num_outputs=2)
-    if cfg.is_fallback:
-        _default_dense_pack_config(cfg, batch, out_dim, in_dim)
-
-    packw_bn = cfg["tile_x"].size[-1]
-    packw_shape = (out_dim // packw_bn, in_dim, packw_bn)
-    packw = tvm.compute(packw_shape,
-                        lambda z, y, x: weight[z * packw_bn + x, y],
-                        name="packed_weight")
-
-    k = tvm.reduce_axis((0, in_dim), name="k")
-    C = tvm.compute((batch, out_dim),
-                    lambda y, x: tvm.sum(
-                        data[y, k].astype(out_dtype) *
-                        packw[x // packw_bn, k, x % packw_bn].astype(out_dtype),
-                        axis=k),
-                    tag="dense_pack")
+    target = tvm.target.current_target()
+    if out_dtype is None:
+        out_dtype = data.dtype
+    batch, in_dim = get_const_tuple(data.shape)
+    out_dim, _ = get_const_tuple(weight.shape)
+    if "cblas" in target.libs:
+        # Offload to extern CBLAS: weight is (out_dim, in_dim), so
+        # transb=True computes data * weight^T.
+        C = cblas.matmul(data, weight, False, True)
+    else:
+        # create tuning space
+        cfg.define_split("tile_y", batch, num_outputs=3)
+        cfg.define_split("tile_x", out_dim, num_outputs=3)
+        cfg.define_split("tile_k", in_dim, num_outputs=2)
+        if cfg.is_fallback:
+            _default_dense_pack_config(cfg, batch, out_dim, in_dim)
+
+        # Pack weight into (out_dim // bn, in_dim, bn) so the innermost
+        # dimension is contiguous for vectorized loads.
+        packw_bn = cfg["tile_x"].size[-1]
+        packw_shape = (out_dim // packw_bn, in_dim, packw_bn)
+        packw = tvm.compute(packw_shape,
+                            lambda z, y, x: weight[z * packw_bn + x, y],
+                            name="packed_weight")
+
+        k = tvm.reduce_axis((0, in_dim), name="k")
+        C = tvm.compute((batch, out_dim),
+                        lambda y, x: tvm.sum(
+                            data[y, k].astype(out_dtype) *
+                            packw[x // packw_bn, k, x % packw_bn].astype(out_dtype),
+                            axis=k),
+                        tag="dense_pack")
    if bias is not None:
        C = tvm.compute((batch, out_dim), lambda i, j: C[i, j] + bias[j].astype(out_dtype),
                        tag=tag.BROADCAST)
    return C

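# NOTE (illustration, not part of the patch): the packed layout above
# satisfies packw[z, y, x] == weight[z * packw_bn + x, y], so output column
# x reads lane x % packw_bn of block x // packw_bn; e.g. with packw_bn = 16,
# column 35 maps to packw[2, k, 3].
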
# Declare dense compute without packing weight
@autotvm.register_topi_compute(nn.dense, "cpu", "direct_nopack")
def _declaration_dense_nopack(cfg, data, weight, bias=None, out_dtype=None):
-    if out_dtype is None:
-        out_dtype = data.dtype
-    batch, in_dim = get_const_tuple(data.shape)
-    out_dim, _ = get_const_tuple(weight.shape)
-    # create tuning space
-    cfg.define_split("tile_x", out_dim, num_outputs=2)
-    cfg.define_split("tile_y", batch, num_outputs=2)
-    cfg.define_split("tile_k", in_dim, num_outputs=2)
-    if cfg.is_fallback:
-        _default_dense_nopack_config(cfg, batch, out_dim, in_dim)
-
-    vec = cfg["tile_k"].size[-1]
-    k = tvm.reduce_axis((0, in_dim // vec), "k")
-    CC = tvm.compute((batch, out_dim, vec),
-                     lambda z, y, x: tvm.sum(
-                         data[z, k * vec + x].astype(out_dtype) *
-                         weight[y, k * vec + x].astype(out_dtype),
-                         axis=k))
-
-    kk = tvm.reduce_axis((0, vec), "kk")
-    C = tvm.compute((batch, out_dim),
-                    lambda y, x: tvm.sum(CC[y, x, kk], axis=kk),
-                    tag="dense_nopack")
+    target = tvm.target.current_target()
+    if out_dtype is None:
+        out_dtype = data.dtype
+    batch, in_dim = get_const_tuple(data.shape)
+    out_dim, _ = get_const_tuple(weight.shape)
+    if "cblas" in target.libs:
+        # Same extern CBLAS offload as the packed variant.
+        C = cblas.matmul(data, weight, False, True)
+    else:
+        # create tuning space
+        cfg.define_split("tile_x", out_dim, num_outputs=2)
+        cfg.define_split("tile_y", batch, num_outputs=2)
+        cfg.define_split("tile_k", in_dim, num_outputs=2)
+        if cfg.is_fallback:
+            _default_dense_nopack_config(cfg, batch, out_dim, in_dim)
+
+        vec = cfg["tile_k"].size[-1]
+        k = tvm.reduce_axis((0, in_dim // vec), "k")
+        CC = tvm.compute((batch, out_dim, vec),
+                         lambda z, y, x: tvm.sum(
+                             data[z, k * vec + x].astype(out_dtype) *
+                             weight[y, k * vec + x].astype(out_dtype),
+                             axis=k))
+
+        kk = tvm.reduce_axis((0, vec), "kk")
+        C = tvm.compute((batch, out_dim),
+                        lambda y, x: tvm.sum(CC[y, x, kk], axis=kk),
+                        tag="dense_nopack")
    if bias is not None:
        C = tvm.compute((batch, out_dim), lambda i, j: C[i, j] + bias[j].astype(out_dtype),
                        tag=tag.BROADCAST)
    return C

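# NOTE (illustration, not part of the patch): the non-packed compute splits
# the reduction of length in_dim into in_dim // vec outer steps (k) plus a
# vec-wide inner strip (kk). CC keeps vec partial sums per output element so
# the innermost products vectorize; the second compute folds them over kk.
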
@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct_pack")
def _schedule_dense_pack(cfg, outs):
+    target = tvm.target.current_target()
+    if "cblas" in target.libs:
+        return generic.schedule_extern(outs)
+
    s = tvm.create_schedule([x.op for x in outs])

    def _callback(op):
@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct_nopack")
def _schedule_dense_nopack(cfg, outs):
+    target = tvm.target.current_target()
+    if "cblas" in target.libs:
+        return generic.schedule_extern(outs)
+
    s = tvm.create_schedule([x.op for x in outs])

    def _callback(op):
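
# ---------------------------------------------------------------------------
# Usage sketch (illustrative, assuming a TVM build linked against a BLAS such
# as OpenBLAS or MKL; the shapes and variable names below are made up). With
# the patch above, routing dense through CBLAS only takes adding "cblas" to
# the target libs:

import numpy as np
import tvm
import topi

data = tvm.placeholder((16, 512), name="data")
weight = tvm.placeholder((128, 512), name="weight")

target = tvm.target.create("llvm -libs=cblas")
with target:
    C = topi.nn.dense(data, weight)
    s = topi.generic.schedule_dense([C])  # takes the schedule_extern path
func = tvm.build(s, [data, weight, C], target)

a = np.random.uniform(size=(16, 512)).astype("float32")
w = np.random.uniform(size=(128, 512)).astype("float32")
c = tvm.nd.array(np.zeros((16, 128), dtype="float32"))
func(tvm.nd.array(a), tvm.nd.array(w), c)
np.testing.assert_allclose(c.asnumpy(), a.dot(w.T), rtol=1e-5)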