From: Thomas Lively <tlively@google.com>
Date: Mon, 11 May 2020 16:55:43 +0000 (-0700)
Subject: [WebAssembly] Add wasm-specific vector shuffle builtin and intrinsic
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=8e3e56f2a36701480eeb65e426701d5a9025cc59;p=platform%2Fupstream%2Fllvm.git

[WebAssembly] Add wasm-specific vector shuffle builtin and intrinsic

Summary:

Although using `__builtin_shufflevector` and the `shufflevector`
instruction works fine, they are not opaque to the optimizer. As a
result, DAGCombine can potentially reduce the number of shuffles and
change the shuffle masks. This is unexpected behavior for users of the
WebAssembly SIMD intrinsics who have crafted their shuffles to
optimize the code generated by engines. This patch solves the problem
by adding a new shuffle intrinsic that is opaque to the optimizers in
line with the decision of the WebAssembly SIMD contributors at
https://github.com/WebAssembly/simd/issues/196#issuecomment-622494748. In
the future we may implement custom DAG combines to properly optimize
shuffles and replace this solution.

Reviewers: aheejin, dschuff

Subscribers: sbc100, jgravelle-google, hiraditya, sunfish, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D66983
---

diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def
index cbdb33c..3c7e275 100644
--- a/clang/include/clang/Basic/BuiltinsWebAssembly.def
+++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def
@@ -119,6 +119,7 @@ TARGET_BUILTIN(__builtin_wasm_avgr_u_i8x16, "V16cV16cV16c", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_avgr_u_i16x8, "V8sV8sV8s", "nc", "simd128")
 
 TARGET_BUILTIN(__builtin_wasm_bitselect, "V4iV4iV4iV4i", "nc", "simd128")
+TARGET_BUILTIN(__builtin_wasm_shuffle_v8x16, "V16cV16cV16cIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIi", "nc", "simd128")
 
 TARGET_BUILTIN(__builtin_wasm_any_true_i8x16, "iV16c", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_any_true_i16x8, "iV8s", "nc", "simd128")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index c64fde7..541dac7 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -16044,6 +16044,20 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
         CGM.getIntrinsic(IntNo, {ConvertType(E->getType()), Vec->getType()});
     return Builder.CreateCall(Callee, Vec);
   }
+  case WebAssembly::BI__builtin_wasm_shuffle_v8x16: {
+    Value *Ops[18];
+    size_t OpIdx = 0;
+    Ops[OpIdx++] = EmitScalarExpr(E->getArg(0));
+    Ops[OpIdx++] = EmitScalarExpr(E->getArg(1));
+    while (OpIdx < 18) {
+      llvm::APSInt LaneConst;
+      if (!E->getArg(OpIdx)->isIntegerConstantExpr(LaneConst, getContext()))
+        llvm_unreachable("Constant arg isn't actually constant?");
+      Ops[OpIdx++] = llvm::ConstantInt::get(getLLVMContext(), LaneConst);
+    }
+    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_shuffle);
+    return Builder.CreateCall(Callee, Ops);
+  }
   default:
     return nullptr;
   }
diff --git a/clang/lib/Headers/wasm_simd128.h b/clang/lib/Headers/wasm_simd128.h
index 612aec1..b81594e 100644
--- a/clang/lib/Headers/wasm_simd128.h
+++ b/clang/lib/Headers/wasm_simd128.h
@@ -1020,23 +1020,31 @@ wasm_f32x4_convert_u32x4(v128_t __a) {
 #define wasm_v8x16_shuffle(__a, __b, __c0, __c1, __c2, __c3, __c4, __c5, __c6, \
                            __c7, __c8, __c9, __c10, __c11, __c12, __c13,       \
                            __c14, __c15)                                       \
-  ((v128_t)(__builtin_shufflevector(                                           \
-      (__u8x16)(__a), (__u8x16)(__b), __c0, __c1, __c2, __c3, __c4, __c5,      \
-      __c6, __c7, __c8, __c9, __c10, __c11, __c12, __c13, __c14, __c15)))
+  ((v128_t)__builtin_wasm_shuffle_v8x16(                                       \
+      (__i8x16)(__a), (__i8x16)(__b), __c0, __c1, __c2, __c3, __c4, __c5,      \
+      __c6, __c7, __c8, __c9, __c10, __c11, __c12, __c13, __c14, __c15))
 
 #define wasm_v16x8_shuffle(__a, __b, __c0, __c1, __c2, __c3, __c4, __c5, __c6, \
                            __c7)                                               \
-  ((v128_t)(__builtin_shufflevector((__u16x8)(__a), (__u16x8)(__b), __c0,      \
-                                    __c1, __c2, __c3, __c4, __c5, __c6,        \
-                                    __c7)))
+  ((v128_t)__builtin_wasm_shuffle_v8x16(                                       \
+      (__i8x16)(__a), (__i8x16)(__b), __c0 * 2, __c0 * 2 + 1, __c1 * 2,        \
+      __c1 * 2 + 1, __c2 * 2, __c2 * 2 + 1, __c3 * 2, __c3 * 2 + 1, __c4 * 2,  \
+      __c4 * 2 + 1, __c5 * 2, __c5 * 2 + 1, __c6 * 2, __c6 * 2 + 1, __c7 * 2,  \
+      __c7 * 2 + 1))
 
 #define wasm_v32x4_shuffle(__a, __b, __c0, __c1, __c2, __c3)                   \
-  ((v128_t)(__builtin_shufflevector((__u32x4)(__a), (__u32x4)(__b), __c0,      \
-                                    __c1, __c2, __c3)))
+  ((v128_t)__builtin_wasm_shuffle_v8x16(                                       \
+      (__i8x16)(__a), (__i8x16)(__b), __c0 * 4, __c0 * 4 + 1, __c0 * 4 + 2,    \
+      __c0 * 4 + 3, __c1 * 4, __c1 * 4 + 1, __c1 * 4 + 2, __c1 * 4 + 3,        \
+      __c2 * 4, __c2 * 4 + 1, __c2 * 4 + 2, __c2 * 4 + 3, __c3 * 4,            \
+      __c3 * 4 + 1, __c3 * 4 + 2, __c3 * 4 + 3))
 
 #define wasm_v64x2_shuffle(__a, __b, __c0, __c1)                               \
-  ((v128_t)(                                                                   \
-      __builtin_shufflevector((__u64x2)(__a), (__u64x2)(__b), __c0, __c1)))
+  ((v128_t)__builtin_wasm_shuffle_v8x16(                                       \
+      (__i8x16)(__a), (__i8x16)(__b), __c0 * 8, __c0 * 8 + 1, __c0 * 8 + 2,    \
+      __c0 * 8 + 3, __c0 * 8 + 4, __c0 * 8 + 5, __c0 * 8 + 6, __c0 * 8 + 7,    \
+      __c1 * 8, __c1 * 8 + 1, __c1 * 8 + 2, __c1 * 8 + 3, __c1 * 8 + 4,        \
+      __c1 * 8 + 5, __c1 * 8 + 6, __c1 * 8 + 7))
 
 #ifdef __wasm_unimplemented_simd128__
 
diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c
index 729a677..5f50861 100644
--- a/clang/test/CodeGen/builtins-wasm.c
+++ b/clang/test/CodeGen/builtins-wasm.c
@@ -724,5 +724,14 @@ i32x4 widen_high_u_i32x4_i16x8(i16x8 v) {
 i8x16 swizzle_v8x16(i8x16 x, i8x16 y) {
   return __builtin_wasm_swizzle_v8x16(x, y);
   // WEBASSEMBLY: call <16 x i8> @llvm.wasm.swizzle(<16 x i8> %x, <16 x i8> %y)
+}
+
+i8x16 shuffle(i8x16 x, i8x16 y) {
+  return __builtin_wasm_shuffle_v8x16(x, y, 0, 1, 2, 3, 4, 5, 6, 7,
+                                      8, 9, 10, 11, 12, 13, 14, 15);
+  // WEBASSEMBLY: call <16 x i8> @llvm.wasm.shuffle(<16 x i8> %x, <16 x i8> %y,
+  // WEBASSEMBLY-SAME: i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+  // WEBASSEMBLY-SAME: i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14,
+  // WEBASSEMBLY-SAME: i32 15
   // WEBASSEMBLY-NEXT: ret
 }
diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
index 90b5a25..a95740c 100644
--- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
+++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -104,6 +104,13 @@ def int_wasm_swizzle :
   Intrinsic<[llvm_v16i8_ty],
             [llvm_v16i8_ty, llvm_v16i8_ty],
             [IntrNoMem, IntrSpeculatable]>;
+def int_wasm_shuffle :
+  Intrinsic<[llvm_v16i8_ty],
+            [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty, llvm_i32_ty,
+             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrNoMem, IntrSpeculatable]>;
 def int_wasm_sub_saturate_signed :
   Intrinsic<[llvm_anyvector_ty],
             [LLVMMatchType<0>, LLVMMatchType<0>],
@@ -116,7 +123,6 @@ def int_wasm_avgr_unsigned :
   Intrinsic<[llvm_anyvector_ty],
             [LLVMMatchType<0>, LLVMMatchType<0>],
             [IntrNoMem, IntrSpeculatable]>;
-
 def int_wasm_bitselect :
   Intrinsic<[llvm_anyvector_ty],
             [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
@@ -170,7 +176,6 @@ def int_wasm_widen_high_unsigned :
             [llvm_anyvector_ty],
             [IntrNoMem, IntrSpeculatable]>;
 
-
 //===----------------------------------------------------------------------===//
 // Bulk memory intrinsics
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 303b33e..4a45858 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -1353,6 +1353,24 @@ SDValue WebAssemblyTargetLowering::LowerIntrinsic(SDValue Op,
                            Op.getOperand(3)  // thrown value
                        });
   }
+
+  case Intrinsic::wasm_shuffle: {
+    // Drop in-chain and replace undefs, but otherwise pass through unchanged
+    SDValue Ops[18];
+    size_t OpIdx = 0;
+    Ops[OpIdx++] = Op.getOperand(1);
+    Ops[OpIdx++] = Op.getOperand(2);
+    while (OpIdx < 18) {
+      const SDValue &MaskIdx = Op.getOperand(OpIdx + 1);
+      if (MaskIdx.isUndef() ||
+          cast<ConstantSDNode>(MaskIdx.getNode())->getZExtValue() >= 32) {
+        Ops[OpIdx++] = DAG.getConstant(0, DL, MVT::i32);
+      } else {
+        Ops[OpIdx++] = MaskIdx;
+      }
+    }
+    return DAG.getNode(WebAssemblyISD::SHUFFLE, DL, Op.getValueType(), Ops);
+  }
   }
 }
 
diff --git a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
index 6e0fadc..c827fb7 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
@@ -141,6 +141,36 @@ define <16 x i8> @narrow_unsigned_v16i8(<8 x i16> %low, <8 x i16> %high) {
   ret <16 x i8> %a
 }
 
+; CHECK-LABEL: shuffle_v16i8:
+; NO-SIMD128-NOT: v8x16
+; SIMD128-NEXT: .functype shuffle_v16i8 (v128, v128) -> (v128){{$}}
+; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $1,
+; SIMD128-SAME: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+declare <16 x i8> @llvm.wasm.shuffle(
+  <16 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+  i32, i32, i32, i32, i32)
+define <16 x i8> @shuffle_v16i8(<16 x i8> %x, <16 x i8> %y) {
+  %res = call <16 x i8> @llvm.wasm.shuffle(<16 x i8> %x, <16 x i8> %y,
+      i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+      i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 35)
+  ret <16 x i8> %res
+}
+
+; CHECK-LABEL: shuffle_undef_v16i8:
+; NO-SIMD128-NOT: v8x16
+; SIMD128-NEXT: .functype shuffle_undef_v16i8 (v128, v128) -> (v128){{$}}
+; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $1,
+; SIMD128-SAME: 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @shuffle_undef_v16i8(<16 x i8> %x, <16 x i8> %y) {
+  %res = call <16 x i8> @llvm.wasm.shuffle(<16 x i8> %x, <16 x i8> %y,
+      i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
+      i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
+      i32 undef, i32 undef, i32 undef, i32 2)
+  ret <16 x i8> %res
+}
+
 ; ==============================================================================
 ; 8 x i16
 ; ==============================================================================