[TOPI][x86] Cascade lake support. (#4123)
authorAnimesh Jain <anijain@umich.edu>
Thu, 17 Oct 2019 16:31:58 +0000 (09:31 -0700)
committerThierry Moreau <moreau@uw.edu>
Thu, 17 Oct 2019 16:31:58 +0000 (09:31 -0700)
* [TOPI][x86] Cascade lake support.

* Jenkins test debug 1.

* Testing cascade lake alone.

python/tvm/relay/qnn/op/legalizations.py
python/tvm/target.py
tests/python/contrib/test_gemm_acc16.py
tests/python/contrib/test_gemm_acc32_vnni.py
tests/python/relay/test_op_level2.py
topi/python/topi/x86/conv2d_avx_1x1.py
topi/python/topi/x86/conv2d_avx_common.py
topi/python/topi/x86/conv2d_int8.py
topi/python/topi/x86/tensor_intrin.py
topi/python/topi/x86/util.py

index 0fdc0f3..6b2e073 100644 (file)
@@ -100,7 +100,7 @@ def _qnn_conv2d_legalize(attrs, inputs, types):
         Checks to ensure that we can use Intel DLBoost instructions - Check if the target is skylake
         and above.
         """
-        supported_arches = {'-mcpu=skylake-avx512',}
+        supported_arches = {'-mcpu=skylake-avx512', '-mcpu=cascadelake'}
         return supported_arches.intersection(set(target.options))
 
     # Collect the dtypes.
index 4548ffa..42045c0 100644 (file)
@@ -128,6 +128,16 @@ class Target(NodeBase):
                 return opt.value[7:]
         return 'unknown'
 
+    @property
+    def mcpu(self):
+        """Returns the mcpu from the target if it exists."""
+        mcpu = ''
+        if self.options is not None:
+            for opt in self.options:
+                if 'mcpu' in opt:
+                    mcpu = opt.split('=')[1]
+        return mcpu
+
     def __enter__(self):
         _api_internal._EnterTargetScope(self)
         return self
index 5551878..17f920e 100644 (file)
@@ -17,7 +17,7 @@
 # pylint: disable=import-self, invalid-name, unused-argument, too-many-lines, len-as-condition
 import tvm
 import numpy as np
-from topi.x86.tensor_intrin import dot_16x1x16_int8_int8_int16
+from topi.x86.tensor_intrin import dot_16x1x16_uint8_int8_int16
 
 
 def benchmark_fc_int8_acc16():
@@ -40,7 +40,7 @@ def benchmark_fc_int8_acc16():
         ctx = tvm.context(target, 0)
         X = tvm.placeholder((m, k), name='X', dtype="uint8")
         W = tvm.placeholder((n, k), name='W', dtype="int8")
-        pc = dot_16x1x16_int8_int8_int16()
+        pc = dot_16x1x16_uint8_int8_int16()
         ak = tvm.reduce_axis((0, k), name='k')
 
         packedW = tvm.placeholder((n//128, 128*(k//2), 2), name='packedW', dtype="int8")
index 34518f4..4f53591 100644 (file)
@@ -18,8 +18,8 @@
 
 import tvm
 import numpy as np
-from topi.x86.tensor_intrin import dot_16x1x16_int8_int8_int32_vnni
-from topi.x86.tensor_intrin import dot_16x1x16_int8_int8_int32
+from topi.x86.tensor_intrin import dot_16x1x16_uint8_int8_int32_cascadelake
+from topi.x86.tensor_intrin import dot_16x1x16_uint8_int8_int32
 import pytest
 
 
@@ -46,7 +46,7 @@ def test_fc_int8_acc32():
             return
 
         ctx = tvm.context(target, 0)
-        pc = dot_16x1x16_int8_int8_int32_vnni()
+        pc = dot_16x1x16_uint8_int8_int32_cascadelake()
         ak = tvm.reduce_axis((0, k), name='k')
         packedW = tvm.placeholder(
             (n // 16, 16 * (k // 4), 4), name='packedW', dtype="int8")
index 0155824..e097980 100644 (file)
@@ -576,57 +576,71 @@ def test_conv2d_int8_intrinsics():
         assembly = lib.get_source("asm")
         return assembly
 
-    # compile conv2d for x86 (skylake) and test assembly contains *pmadd* instructions
-    target = "llvm -mcpu=skylake-avx512"
-    name = "llvm.x86.avx512.pmaddubs.w.512"
-    llvm_id = tvm.codegen.llvm_lookup_intrinsic_id(name)
-    if llvm_id != 0:
-        fast_int8_dtypes = ('uint8', 'int8', 'int32')
-        # Sweep the input channels to check int8 robustness
-        for ic in range(1, 24):
-            asm = _compile(ic=ic, oc=32, target=target, data_layout="NCHW", kernel_layout='OIHW',
-                           dtypes=fast_int8_dtypes)
-            assert "pmaddubs" in asm
-
-        for ic in range(1, 24):
-            asm = _compile(ic=ic, oc=32, target=target, data_layout="NHWC", kernel_layout='HWIO',
-                           dtypes=fast_int8_dtypes)
-            assert "pmaddubs" in asm
-
-
-        # Sweep the output channels to check int8 robustness
-        for oc in range(2, 24):
-            asm = _compile(ic=16, oc=oc, target=target, data_layout="NCHW", kernel_layout='OIHW',
+    def _has_fast_int8_instructions(asm, target):
+        if 'skylake-avx512' in target:
+            return "pmaddubs" in asm
+        elif 'cascadelake' in target:
+            return "vpdpbusd" in asm
+        else:
+            assert False, "Target should be Skylake or Cascadelake"
+
+    # Compile conv2d for x86 (skylake, cascadelake) and test that the assembly
+    targets = ["llvm -mcpu=skylake-avx512", "llvm -mcpu=cascadelake"]
+    llvm_version = tvm.codegen.llvm_version_major()
+    for target in targets:
+        if llvm_version >= 8:
+            fast_int8_dtypes = ('uint8', 'int8', 'int32')
+            # Sweep the input channels to check int8 robustness
+            # Input channels should be a multiple of 4 internally.
+            for ic in [1, 4, 6]:
+                asm = _compile(ic=ic, oc=32, target=target, data_layout="NCHW",
+                               kernel_layout='OIHW',
+                               dtypes=fast_int8_dtypes)
+                assert _has_fast_int8_instructions(asm, target)
+
+            for ic in [1, 4, 6]:
+                asm = _compile(ic=ic, oc=32, target=target, data_layout="NHWC",
+                               kernel_layout='HWIO',
+                               dtypes=fast_int8_dtypes)
+                assert _has_fast_int8_instructions(asm, target)
+
+
+            # Sweep the output channels to check int8 robustness
+            # Output channels should be a multiple of 16 internally.
+            for oc in [4, 16, 20]:
+                asm = _compile(ic=16, oc=oc, target=target, data_layout="NCHW",
+                               kernel_layout='OIHW',
+                               dtypes=fast_int8_dtypes)
+                assert _has_fast_int8_instructions(asm, target)
+
+            for oc in [4, 16, 20]:
+                asm = _compile(ic=16, oc=oc, target=target, data_layout="NHWC",
+                               kernel_layout='HWIO',
+                               dtypes=fast_int8_dtypes)
+                assert _has_fast_int8_instructions(asm, target)
+
+            # Check that both non-divisible oc and ic work
+            asm = _compile(ic=17, oc=29, target=target, data_layout="NCHW", kernel_layout='OIHW',
                            dtypes=fast_int8_dtypes)
-            assert "pmaddubs" in asm
+            assert _has_fast_int8_instructions(asm, target)
 
-        for oc in range(2, 24):
-            asm = _compile(ic=16, oc=oc, target=target, data_layout="NHWC", kernel_layout='HWIO',
+            asm = _compile(ic=17, oc=29, target=target, data_layout="NHWC", kernel_layout='HWIO',
                            dtypes=fast_int8_dtypes)
-            assert "pmaddubs" in asm
-
-        # Check that both non-divisible oc and ic work
-        asm = _compile(ic=17, oc=29, target=target, data_layout="NCHW", kernel_layout='OIHW',
-                       dtypes=fast_int8_dtypes)
-        assert "pmaddubs" in asm
-
-        asm = _compile(ic=17, oc=29, target=target, data_layout="NHWC", kernel_layout='HWIO',
-                       dtypes=fast_int8_dtypes)
-        assert "pmaddubs" in asm
-
-        # Ensure that code is generated when datatypes are not HW supported.
-        dtypes = ('int8', 'int8', 'int32')
-        asm = _compile(ic=16, oc=32, target=target, data_layout="NHWC", kernel_layout='HWIO',
-                       dtypes=dtypes)
-        # Check that intrinisic is not present in the assembly.
-        assert "pmaddubs" not in asm
-
-        # Ensure that code is generated when datatypes are not HW supported.
-        dtypes = ('uint8', 'uint8', 'int32')
-        asm = _compile(ic=16, oc=32, target=target, data_layout="NHWC", kernel_layout='HWIO',
-                       dtypes=dtypes)
-        # Check that intrinisic is not present in the assembly.
-        assert "pmaddubs" not in asm
+            assert _has_fast_int8_instructions(asm, target)
+
+            # Ensure that code is generated when datatypes are not HW supported.
+            dtypes = ('int8', 'int8', 'int32')
+            asm = _compile(ic=16, oc=32, target=target, data_layout="NHWC", kernel_layout='HWIO',
+                           dtypes=dtypes)
+            # Check that the intrinsic is not present in the assembly.
+            assert not _has_fast_int8_instructions(asm, target)
+
+            # Ensure that code is generated when datatypes are not HW supported.
+            dtypes = ('uint8', 'uint8', 'int32')
+            asm = _compile(ic=16, oc=32, target=target, data_layout="NHWC", kernel_layout='HWIO',
+                           dtypes=dtypes)
+            # Check that the intrinsic is not present in the assembly.
+            assert not _has_fast_int8_instructions(asm, target)
 
     # Check that a vectorized instruction is generated for older Intel
     # generations, because we default to NCHWc layout.
index 96b6e47..2a81dcc 100644 (file)
@@ -24,7 +24,7 @@ from ..nn.pad import pad
 from ..nn.util import infer_pad, get_pad_tuple
 from ..generic import conv2d as conv2d_generic
 from ..util import get_const_tuple, simplify
-from .tensor_intrin import dot_16x1x16_int8_int8_int32
+from .tensor_intrin import dot_16x1x16_uint8_int8_int32
 from .util import get_fp32_len
 
 def _fallback_schedule(cfg, wkl):
@@ -183,7 +183,7 @@ def _schedule_conv_NCHWc(s, cfg, data, conv_out, last):
 def _schedule_conv_NCHWc_int8(s, cfg, data, conv_out, last):
     return conv2d_generic.schedule_conv_NCHWc_cpu_1x1_int8(s, cfg, data, conv_out, last,
                                                            int32_lanes=16,
-                                                           intrin=dot_16x1x16_int8_int8_int32())
+                                                           intrin=dot_16x1x16_uint8_int8_int32())
 
 
 def _declaration_conv_nhwc_pack(cfg, Input, Filter, stride, padding, dilation, out_dtype):
@@ -282,7 +282,7 @@ def _schedule_conv_nhwc_pack_int8(s, cfg, data, conv_out, last):
     ic_f_outer, ic_s_outer = s[C].split(ic_outer, factor=ic_factor)
     s[C].reorder(oc_outer, oh, ow, ic_f_outer, ic_s_outer, kh, kw, oc_inner, ic_inner)
 
-    pc = dot_16x1x16_int8_int8_int32()
+    pc = dot_16x1x16_uint8_int8_int32()
     s[C].tensorize(oc_inner, pc)
 
     if C != O:
index 53b79bd..7c5096d 100644 (file)
@@ -23,7 +23,7 @@ from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
 from ..nn.util import infer_pad
 from ..generic import conv2d as conv2d_generic
 from ..util import get_const_tuple
-from .tensor_intrin import dot_16x1x16_int8_int8_int32
+from .tensor_intrin import dot_16x1x16_uint8_int8_int32
 from .util import get_fp32_len
 
 def _fallback_schedule(cfg, wkl):
@@ -209,4 +209,4 @@ def _schedule_conv_NCHWc(s, cfg, data, conv_out, last):
 def _schedule_conv_NCHWc_int8(s, cfg, data, conv_out, last):
     return conv2d_generic.schedule_conv_NCHWc_cpu_common_int8(s, cfg, data, conv_out, last,
                                                               int32_lanes=16,
-                                                              intrin=dot_16x1x16_int8_int8_int32())
+                                                              intrin=dot_16x1x16_uint8_int8_int32())
index f701108..df53850 100644 (file)
@@ -57,16 +57,14 @@ def _is_int8_hw_support(data_dtype, kernel_dtype):
     is_dtype_support = data_dtype == 'uint8' and kernel_dtype == 'int8'
 
     # 2) Check LLVM support
-    llvm_intrin_fast_int8 = "llvm.x86.avx512.pmaddubs.w.512"
-    llvm_id = tvm.codegen.llvm_lookup_intrinsic_id(llvm_intrin_fast_int8)
-    is_llvm_support = llvm_id != 0
+    llvm_version = tvm.codegen.llvm_version_major()
+    is_llvm_support = llvm_version >= 8
 
     # 3) Check target
-    target = tvm.target.current_target()
+    mcpu = tvm.target.current_target().mcpu
     is_target_support = False
-    for opt in target.options:
-        if opt == '-mcpu=skylake-avx512':
-            is_target_support = True
+    if mcpu == 'skylake-avx512' or mcpu == 'cascadelake':
+        is_target_support = True
 
     return is_dtype_support and is_llvm_support and is_target_support
 
index cba00c0..a8ad251 100644 (file)
 import tvm
 
 
-def dot_16x1x16_int8_int8_int32():
+def dot_16x1x16_uint8_int8_int32():
+    """Dispatch the most optimized intrin depending on the target"""
+    mcpu = tvm.target.current_target().mcpu
+
+    assert mcpu in ("skylake-avx512", "cascadelake"), \
+            "An old Intel machine that does not have fast Int8 support."
+    if mcpu == "skylake-avx512":
+        return dot_16x1x16_uint8_int8_int32_skylake()
+    # cascadelake
+    return dot_16x1x16_uint8_int8_int32_cascadelake()
+
+
+def dot_16x1x16_uint8_int8_int32_skylake():
     """
     Int8 dot product by every 4 elements using AVX512 Skylake instructions.
-    This function takes two arrays of int8 datatype -- data[4] and
+    This function takes two arrays of uint8 and int8 datatype -- data[4] and
     kernel[16][4] -- and computes a dot product of data[4] with every
     4 elements of kernels, resulting in output[16] of int32 datatype.
     The pseudo code is as follows.
     .. code-block:: c
-        void dot_16x1x16_int8_int8_int32(int8 data[4], int8 kernel[16][4],
+        void dot_16x1x16_uint8_int8_int32(uint8 data[4], int8 kernel[16][4],
                 int32 output[16]){
             for (int i = 0; i < 16; i++){
                 output[i] = 0;
@@ -100,15 +112,15 @@ def dot_16x1x16_int8_int8_int32():
         return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={data:a_buffer, kernel:b_buffer})
 
 
-def dot_16x1x16_int8_int8_int16():
+def dot_16x1x16_uint8_int8_int16():
     """
     Int8 dot product by every 2 elements using AVX512 Skylake instructions.
-    This function takes two arrays of int8 datatype -- data[2] and
+    This function takes two arrays of uint8 and int8 datatype -- data[2] and
     kernel[4][32][2] -- and computes a dot product of data[2] with every
     2 elements of kernels, resulting in output[4][32] of int16 datatype.
     The pseudo code is as follows.
     .. code-block:: c
-        void dot_16x1x16_int8_int8_int16(int8 data[2], int8 kernel[32*4][2],
+        void dot_16x1x16_uint8_int8_int16(uint8 data[2], int8 kernel[32*4][2],
                 int16 output[32*4]){
             for (int i = 0; i< 4; i++){
                 for (int j = 0; j < 32; j++){
@@ -182,15 +194,15 @@ def dot_16x1x16_int8_int8_int16():
         return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={data:a_buffer, kernel:b_buffer})
 
 
-def dot_16x1x16_int8_int8_int32_vnni():
+def dot_16x1x16_uint8_int8_int32_cascadelake():
     """
     Int8 dot product by every 4 elements using AVX512VNNI Cascade Lake instructions.
-    This function takes two arrays of int8 datatype -- data[4] and
+    This function takes two arrays of uint8 and int8 datatype -- data[4] and
     kernel[16][4] -- and computes a dot product of data[4] with every
     4 elements of kernels, resulting in output[16] of int32 datatype.
     The pseudo code is as follows.
     .. code-block:: c
-        void dot_16x1x16_int8_int8_int32_vnni(int8 data[4], int8 kernel[16][4],
+        void dot_16x1x16_uint8_int8_int32_cascadelake(uint8 data[4], int8 kernel[16][4],
                 int32 output[16]){
             for (int i = 0; i < 16; i++){
                 output[i] = 0;
index f0b3c75..00f297e 100644 (file)
@@ -19,10 +19,8 @@ from __future__ import absolute_import as _abs
 import tvm
 
 def get_fp32_len():
+    mcpu = tvm.target.current_target().mcpu
     fp32_vec_len = 8
-    target = tvm.target.current_target()
-    if target is not None:
-        for opt in target.options:
-            if opt == '-mcpu=skylake-avx512':
-                fp32_vec_len = 16
+    if mcpu == 'skylake-avx512' or mcpu == 'cascadelake':
+        fp32_vec_len = 16
     return fp32_vec_len