# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
import tvm
import numpy as np
from tvm.contrib.nvcc import have_fp16, have_int8
from tvm.contrib import nvcc
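+# Shared thread/block IterVars, reused at every bind site below instead of
+# constructing a fresh tvm.thread_axis(...) each time.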
+tx = tvm.thread_axis("threadIdx.x")
+bx = tvm.thread_axis("blockIdx.x")
+
+
def test_cuda_vectorize_add():
num_thread = 8
def check_cuda(dtype, n, lanes):
B = tvm.compute((n,), lambda i: A[i]+tvm.const(1, A.dtype), name='B')
s = tvm.create_schedule(B.op)
xo, xi = s[B].split(B.op.axis[0], factor=num_thread)
- s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
- s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
+ s[B].bind(xo, bx)
+ s[B].bind(xi, tx)
fun = tvm.build(s, [A, B], "cuda")
ctx = tvm.gpu(0)
a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np_a)
D = tvm.compute((n,), lambda i: tvm.call_pure_extern("int32", "__dp4a", A[i], B[i], C[i]), name='D')
s = tvm.create_schedule(D.op)
xo, xi = s[D].split(D.op.axis[0], factor=num_thread)
- s[D].bind(xo, tvm.thread_axis("blockIdx.x"))
- s[D].bind(xi, tvm.thread_axis("threadIdx.x"))
+ s[D].bind(xo, bx)
+ s[D].bind(xi, tx)
fun = tvm.build(s, [A, B, C, D], "cuda")
np_a = np.random.randint(low=-128, high=127, size=(n,lanes))
np_b = np.random.randint(low=-128, high=127, size=(n,lanes))
A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes))
B = tvm.compute((n,), lambda i: A[i], name='B')
s = tvm.create_schedule(B.op)
- bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
- s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
- s[B].bind(tx, tvm.thread_axis("threadIdx.x"))
+ block, thread = s[B].split(B.op.axis[0], factor=num_thread)
+ s[B].bind(block, bx)
+ s[B].bind(thread, tx)
fun = tvm.build(s, [A, B], "cuda", name="vector_load")
np_a = np.random.randint(low=-128, high=127, size=(n,lanes))
a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np_a)
s = tvm.create_schedule(A.op)
y, x = s[A].op.axis
s[A].vectorize(x)
- s[A].bind(y, tvm.thread_axis("blockIdx.x"))
+ s[A].bind(y, bx)
fun = tvm.build(s, [A], "cuda", name="make_int8x4")
np_a = np.full((n, lanes), value, dtype=dtype)
a = tvm.nd.empty(np_a.shape, dtype, ctx)
inf_value = tvm.const(value, dtype=dtype)
C = tvm.compute((n,), lambda i: inf_value, name='C')
s = tvm.create_schedule(C.op)
- s[C].bind(s[C].op.axis[0], tvm.thread_axis("threadIdx.x"))
+ s[C].bind(s[C].op.axis[0], tx)
fun = tvm.build(s, [A, C], target)
a = tvm.nd.empty((n,), A.dtype, ctx)
c = tvm.nd.empty((n,), A.dtype, ctx)
module(nda, ndb, ndc)
tvm.testing.assert_allclose(ndc.asnumpy(), ref)
+
+def test_cuda_reduction_binding():
+ if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
+ print("skip because cuda is not enabled..")
+ return
+
+ k = tvm.reduce_axis((0, 32), 'k')
+ A = tvm.placeholder((96, 32), name='A')
+ B = tvm.compute((96,), lambda m: tvm.sum(A[m, k], axis=k), name='B')
+ s = tvm.create_schedule(B.op)
+
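+ # Reorder so the reduce axis runs outermost, then split the row axis and
+ # bind its outer part to blockIdx.x.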
+ s[B].reorder(B.op.reduce_axis[0], B.op.axis[0])
+
+ mo, _ = s[B].split(B.op.axis[0], 32)
+ s[B].bind(mo, bx)
+
+ fcuda = tvm.build(s, [A, B], "cuda")
+
+def test_rfactor_predicates():
+ if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
+ print("skip because cuda is not enabled..")
+ return
+
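+ # The extent 129 is not divisible by the split factors used below, so the
+ # partial stores of the rfactor'ed stages have to be guarded by predicates.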
+ n = tvm.reduce_axis((0, 129), 'n')
+ A = tvm.placeholder((129,), name='A')
+ B = tvm.compute((1,), lambda b: tvm.sum(A[n], axis=n), name='B')
+
+ s = tvm.create_schedule(B.op)
+
+ _, ni = s[B].split(s[B].op.reduce_axis[0], factor=8)
+
+ BF = s.rfactor(B, ni, 0)
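+ # Only the thread with threadIdx.x == 0 writes the final result of B.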
+ s[B].set_store_predicate(tx.var.equal(0))
+
+ s[B].bind(s[B].op.reduce_axis[0], tx)
+ s[B].bind(s[B].op.axis[0], bx)
+
+ s[BF].compute_at(s[B], s[B].op.axis[0])
+
+ _, noi = s[BF].split(s[BF].op.reduce_axis[0], factor=2)
+
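+ # rfactor the already-rfactored stage again after another non-exact split.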
+ BF2 = s.rfactor(BF, noi, 0)
+
+ s[BF].bind(s[BF].op.axis[0], tx)
+ s[BF2].compute_at(s[BF], s[BF].op.axis[1])
+
+ fcuda = tvm.build(s, [A, B], "cuda")
+
+
if __name__ == "__main__":
test_cuda_vectorize_add()
test_cuda_multiply_add()
test_cuda_make_int8x4()
test_cuda_inf_nan()
test_cuda_shuffle()
+ test_cuda_reduction_binding()
+ test_rfactor_predicates()
tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy(), rtol=1e-5)
tvm.testing.assert_allclose(d.asnumpy(), a.asnumpy(), rtol=1e-5)
+def test_simple_rfactor():
+ K = 16*4+4
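+ # K = 68 is not a multiple of the split factor 16, so the rfactor'ed
+ # reduction gets a tail that LoopPartition can peel off.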
+ k = tvm.reduce_axis((0, K), 'k')
+
+ A = tvm.placeholder((1, K), name='A')
+
+ B = tvm.compute((1,), lambda b: tvm.sum(A[b, k], axis=k), name='B')
+
+ s = tvm.create_schedule(B.op)
+ ko, _ = s[B].split(s[B].op.reduce_axis[0], 16)
+ BF = s.rfactor(B, ko, 0)
+
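+ # Lower manually (normalize, infer bounds, build the stmt) so the IR can be
+ # compared before and after LoopPartition.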
+ s = s.normalize()
+ bounds = tvm.schedule.InferBound(s)
+
+ stmt1 = tvm.schedule.ScheduleOps(s, bounds)
+ stmt1 = tvm.ir_pass.Simplify(stmt1)
+
+ stmt2 = tvm.ir_pass.LoopPartition(stmt1, True)
+ stmt2 = tvm.ir_pass.Simplify(stmt2)
+
+ # Make sure LoopPartition actually changed the IR.
+ assert not tvm.ir_pass.Equal(stmt1.body, stmt2.body)
+
+
if __name__ == "__main__":
test_basic()
test_const_loop()
test_cce_loop_3()
test_conv_tiling()
test_double_splitting_with_indivisible_factors()
+ test_simple_rfactor()