--- /dev/null
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=import-self, invalid-name, unused-argument, too-many-lines, len-as-condition
+
+import tvm
+import numpy as np
+from topi.x86.tensor_intrin import dot_16x1x16_int8_int8_int32_vnni
+from topi.x86.tensor_intrin import dot_16x1x16_int8_int8_int32
+from nose.tools import nottest
+
+
+@nottest
+def test_fc_int8_acc32():
+ m = 1024
+ n = 1024
+ k = 1024
+
+ X = tvm.placeholder((m, k), name='X', dtype="uint8")
+ W = tvm.placeholder((n, k), name='W', dtype="int8")
+
+    peak = 280  # assumed peak throughput of the test machine, in Gops/s
+    print("Peak {} Gops/s".format(peak))
+    memory_ops = m * k + n * k + 2 * m * n  # elements read/written, for a rough bandwidth estimate
+    gops_per_mm = 2 * m * n * k  # multiply-add operations in one matmul
+
+    # With LLVM < 8.0, this setting triggers the warning "'cascadelake' is not a
+    # recognized processor for this target (ignoring processor)". Once LLVM 8.0
+    # is enabled in the test environment, the cascadelake setting should be used.
+ def verify(target="llvm -mcpu=cascadelake"):
+ if not tvm.module.enabled(target):
+ print("skip because %s is not enabled..." % target)
+ return
+
+ ctx = tvm.context(target, 0)
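+        # Tensor intrinsic: a 16-lane int32 dot product over groups of four
+        # 8-bit values; it emits the VNNI instruction when the LLVM target
+        # supports it and falls back to plain AVX512 otherwise.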
+ pc = dot_16x1x16_int8_int8_int32_vnni()
+ ak = tvm.reduce_axis((0, k), name='k')
+ packedW = tvm.placeholder(
+ (n // 16, 16 * (k // 4), 4), name='packedW', dtype="int8")
+
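+        # packedW is laid out so that packedW[j / 16, (ak / 4) * 16 + j % 16, ak % 4]
+        # holds W[j, ak] (integer division); the packing loop below builds this layout.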
+        t_fc = tvm.compute(
+            (m, n),
+            lambda i, j: tvm.sum(
+                X[i, ak].astype("int32") *
+                packedW[j / 16, (ak / 4) * 16 + j % 16, ak % 4].astype("int32"),
+                axis=ak),
+            name="F")
+ t_sch = tvm.create_schedule(t_fc.op)
+ a_x, a_y = t_fc.op.axis
+ a_k, = t_fc.op.reduce_axis
+
+ a_yo, a_yi = t_sch[t_fc].split(a_y, factor=16)
+ a_xo, a_xi = t_sch[t_fc].split(a_x, factor=32)
+ a_ko, a_ki = t_sch[t_fc].split(a_k, factor=4)
+ a_koo, a_koi = t_sch[t_fc].split(a_ko, factor=4)
+ t_sch[t_fc].reorder(a_yo, a_xo, a_xi, a_koo, a_koi, a_yi, a_ki)
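+        # The innermost a_yi (16) x a_ki (4) block matches the 16x4 shape the
+        # dot-product intrinsic consumes, so it can be tensorized below.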
+
+ t_sch[t_fc].unroll(a_koi)
+ t_sch[t_fc].tensorize(a_yi, pc)
+
+ t_func = tvm.build(t_sch, [X, packedW, t_fc], target, name="intrinsic")
+ t_evaluator = t_func.time_evaluator(t_func.entry_name, ctx, number=10)
+
+ # generate the plain data
+ a_ = np.random.uniform(1, 10, size=(m, k)).astype("uint8")
+ b_ = np.random.uniform(1, 10, size=(n, k)).astype("int8")
+
+        packW = np.random.uniform(1, 10, size=(n // 16, 16 * (k // 4), 4)).astype("int8")
+        # Pack the weight matrix into the layout expected by the intrinsic;
+        # in a real workload this packing happens in a pre-compute stage.
+        for r_idx in range(n // 16):
+            for s_idx in range(16 * (k // 4)):
+                for t_idx in range(4):
+                    packW[r_idx][s_idx][t_idx] = \
+                        b_[r_idx * 16 + s_idx % 16][(s_idx // 16) * 4 + t_idx]
+
+ x = tvm.nd.array(a_, ctx)
+ w = tvm.nd.array(packW, ctx)
+ y = tvm.nd.array(np.zeros((m, n), dtype="int32"), ctx)
+ result = t_evaluator(x, w, y)
+
+ gops_per_sec = gops_per_mm / result.mean / 1e9
+ # verify the correctness
+        # Compute the reference in int32 so the numpy accumulator cannot overflow.
+        tvm.testing.assert_allclose(
+            y.asnumpy(), np.dot(a_.astype("int32"), b_.T.astype("int32")), rtol=0)
+        print('Tensorization: running time: {:.3f} ms, {:.2f} Gops/s, efficiency: {:.2f}'.format(
+ result.mean * 1000, gops_per_sec, gops_per_sec / peak))
+ t_func.export_library("tensorize_acc32.o")
+
+ verify()
+
+
+if __name__ == "__main__":
+    # The test requires a Cascade Lake or newer Intel machine to generate the
+    # AVX512 VNNI instruction, so it is disabled by default.
+
+ # test_fc_int8_acc32()
+ pass
def dot_16x1x16_int8_int8_int32():
"""
- Int8 dot product by every 4 elements using AVX2 Skylake instructions.
+ Int8 dot product by every 4 elements using AVX512 Skylake instructions.
This function takes two arrays of int8 datatype -- data[4] and
kernel[16][4] -- and computes a dot product of data[4] with every
4 elements of kernels, resulting in output[16] of int32 datatype.
void dot_16x1x16_int8_int8_int32(int8 data[4], int8 kernel[16][4],
int32 output[16]){
for (int i = 0; i < 16; i++){
- out[i] = 0;
+ output[i] = 0;
for (int k = 0; k < 4; k++){
- out[i] += data[k] * kernel[i][k]
+ output[i] += data[k] * kernel[i][k]
}
}
}
def dot_16x1x16_int8_int8_int16():
"""
- Int8 dot product by every 2 elements using AVX2 Skylake instructions.
+ Int8 dot product by every 2 elements using AVX512 Skylake instructions.
This function takes two arrays of int8 datatype -- data[2] and
kernel[4][32][2] -- and computes a dot product of data[2] with every
2 elements of kernels, resulting in output[4][32] of int16 datatype.
.. code-block:: c
void dot_16x1x16_int8_int8_int16(int8 data[2], int8 kernel[32*4][2],
int16 output[32*4]){
- for (int i = 0; i< 4; i++){
+            for (int i = 0; i < 4; i++){
for (int j = 0; j < 32; j++){
- out[i][i] = 0;
+                    output[i][j] = 0;
for (int k = 0; k < 2; k++){
- out[i][j][k] += data[k] * kernel[i][j][k]
+                        output[i][j] += data[k] * kernel[i][j][k]
}
}
+ }
}
- }
+
Physically, the kernel array sits in four AVX512 vector registers and
the data[2] is broadcasted to another AVX512 vector register. This
function returns a TensorIntrin that can be used to tensorize
a schedule.
+
Returns
-------
intrin : TensorIntrin
The Skylake int8 TensorIntrin that can be used in tensorizing schedule
"""
- num_int8_elements = 2 # 2 int8 elements in int32
+    int16_lanes = 4 * 32  # 4*32 int16 lanes in 4 AVX512 vector registers
+ num_int8_elements = 2 # 2 int8 elements in int16
data = tvm.placeholder((num_int8_elements,), dtype='uint8', name='data')
- kernel = tvm.placeholder((128, num_int8_elements), dtype='int8', name='kernel')
+ kernel = tvm.placeholder((int16_lanes, num_int8_elements), dtype='int8', name='kernel')
k = tvm.reduce_axis((0, num_int8_elements), name='k')
- C = tvm.compute((128, ),
+ C = tvm.compute((int16_lanes, ),
lambda i: tvm.sum(data[k].astype('int16') *
kernel[i, k].astype('int16'),
axis=k),
with tvm.build_config(offset_factor=1, partition_const_loop=True):
return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={data:a_buffer, kernel:b_buffer})
+
+
+def dot_16x1x16_int8_int8_int32_vnni():
+ """
+ Int8 dot product by every 4 elements using AVX512VNNI Cascade Lake instructions.
+ This function takes two arrays of int8 datatype -- data[4] and
+ kernel[16][4] -- and computes a dot product of data[4] with every
+ 4 elements of kernels, resulting in output[16] of int32 datatype.
+ The pseudo code is as follows.
+ .. code-block:: c
+ void dot_16x1x16_int8_int8_int32_vnni(int8 data[4], int8 kernel[16][4],
+ int32 output[16]){
+ for (int i = 0; i < 16; i++){
+ output[i] = 0;
+ for (int k = 0; k < 4; k++){
+ output[i] += data[k] * kernel[i][k]
+ }
+ }
+ }
+
+ Physically, the kernel array sits in an AVX512 vector register and
+ the data[4] is broadcasted to another AVX512 vector register. This
+ function returns a TensorIntrin that can be used to tensorize
+ a schedule.
+
+ Returns
+ -------
+ intrin : TensorIntrin
+ The Cascade Lake int8 TensorIntrin that can be used in tensorizing schedule
+ """
+
+ int32_lanes = 16 # 16 int32 lanes in AVX512
+ num_int8_elements = 4 # 4 int8 elements in int32
+ data = tvm.placeholder((num_int8_elements,), dtype='uint8', name='data')
+ kernel = tvm.placeholder((int32_lanes, num_int8_elements), dtype='int8', name='kernel')
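+    # data is uint8 and kernel is int8 because vpdpbusd (and the vpmaddubsw
+    # fallback) multiply unsigned bytes by signed bytes.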
+ k = tvm.reduce_axis((0, num_int8_elements), name='k')
+ C = tvm.compute((int32_lanes,),
+ lambda i: tvm.sum(data[k].astype('int32') *
+ kernel[i, k].astype('int32'),
+ axis=k),
+ name="C")
+
+ a_buffer = tvm.decl_buffer(data.shape, dtype='uint8', name="a_buffer",
+ offset_factor=1,
+ strides=[1])
+ b_buffer = tvm.decl_buffer(kernel.shape, dtype='int8', name="b_buffer",
+ offset_factor=1,
+ strides=[tvm.var('ldw'), 1])
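+    # The symbolic outer stride 'ldw' lets the intrinsic match kernel buffers
+    # with an arbitrary leading dimension.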
+
+ def _intrin_func(ins, outs):
+ def _instr(index):
+ ib = tvm.ir_builder.create()
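+            # index selects which body is generated: 0 = body, 1 = reduction
+            # reset (zero the 16 int32 output lanes), 2 = update.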
+ if index == 1:
+ ib.emit(outs[0].vstore(0, tvm.const(0, 'int32x16')))
+ return ib.get()
+
+ a_int8 = ins[0].vload([0], "uint8x4")
+ re_int32 = tvm.call_pure_intrin('int32', 'reinterpret', a_int8)
+ vec_ai32 = re_int32.astype('int32x16')
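+            # The four uint8 values are reinterpreted as one int32 and then
+            # broadcast to all 16 lanes of an AVX512 register.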
+ vec_b = ins[1].vload([0, 0], "int8x64")
+
+ vnni_inst_name = 'llvm.x86.avx512.vpdpbusd.512'
+ llvm_id = tvm.codegen.llvm_lookup_intrinsic_id(vnni_inst_name)
+
+ if llvm_id != 0: # VNNI is available for current LLVM version
+ vec_bi32 = tvm.call_pure_intrin('int32x16', 'reinterpret', vec_b)
+ vec_zero = tvm.const(0, "int32x16")
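+                # vpdpbusd accumulates, per int32 lane, the sum of four
+                # unsigned-byte x signed-byte products in a single instruction.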
+ quad_reduction = tvm.call_llvm_intrin('int32x16',
+ 'llvm.x86.avx512.vpdpbusd.512',
+ tvm.const(0, 'uint32'),
+ vec_zero,
+ vec_ai32, vec_bi32)
+ else: # Fall back to the normal AVX512
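+                # Emulate the 4-way reduction in two steps: vpmaddubsw multiplies
+                # u8 x s8 and adds adjacent pairs into int16, then vpmaddwd against
+                # a vector of ones adds adjacent int16 pairs into 16 int32 lanes.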
+ vec_a = tvm.call_pure_intrin('int8x64', 'reinterpret', vec_ai32)
+ vec_one = tvm.const(1, "int16x32")
+ pair_reduction = tvm.call_llvm_intrin('int16x32',
+ 'llvm.x86.avx512.pmaddubs.w.512',
+ tvm.const(0, 'uint32'),
+ vec_a, vec_b)
+ quad_reduction = tvm.call_llvm_intrin('int32x16',
+ 'llvm.x86.avx512.pmaddw.d.512',
+ tvm.const(0, 'uint32'),
+ pair_reduction, vec_one)
+
+ if index == 0:
+ ib.emit(outs[0].vstore(0, quad_reduction))
+ else:
+ ib.emit(outs[0].vstore(0, quad_reduction + outs[0].vload([0], 'int32x16')))
+ return ib.get()
+
+ # body, reset, update
+ return _instr(0), _instr(1), _instr(2)
+
+ with tvm.build_config(offset_factor=1, partition_const_loop=True):
+ return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={data:a_buffer, kernel:b_buffer})