[x86][CGP] try to hoist funnel shift above select-of-splats

author Sanjay Patel <spatel@rotateright.com>

Fri, 15 May 2020 19:22:30 +0000 (15:22 -0400)

committer Sanjay Patel <spatel@rotateright.com>

Sat, 16 May 2020 14:44:47 +0000 (10:44 -0400)
author Sanjay Patel <spatel@rotateright.com>
Fri, 15 May 2020 19:22:30 +0000 (15:22 -0400)
committer Sanjay Patel <spatel@rotateright.com>
Sat, 16 May 2020 14:44:47 +0000 (10:44 -0400)
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp

index 8723e26..1c547ab 100644 (file)
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -391,6 +391,7 @@ class TypePromotionTransaction;
      bool optimizeExtUses(Instruction *I);
      bool optimizeLoadExt(LoadInst *Load);
      bool optimizeShiftInst(BinaryOperator *BO);
+    bool optimizeFunnelShift(IntrinsicInst *Fsh);
      bool optimizeSelectInst(SelectInst *SI);
      bool optimizeShuffleVectorInst(ShuffleVectorInst *SVI);
      bool optimizeSwitchInst(SwitchInst *SI);
@@ -2061,6 +2062,9 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
      case Intrinsic::ctlz:
        // If counting zeros is expensive, try to avoid it.
        return despeculateCountZeros(II, TLI, DL, ModifiedDT);
+    case Intrinsic::fshl:
+    case Intrinsic::fshr:
+      return optimizeFunnelShift(II);
      case Intrinsic::dbg_value:
        return fixupDbgValue(II);
      case Intrinsic::vscale: {
@@ -6240,6 +6244,41 @@ bool CodeGenPrepare::optimizeShiftInst(BinaryOperator *Shift) {
    return true;
  }
  
+bool CodeGenPrepare::optimizeFunnelShift(IntrinsicInst *Fsh) {
+  Intrinsic::ID Opcode = Fsh->getIntrinsicID();
+  assert((Opcode == Intrinsic::fshl || Opcode == Intrinsic::fshr) &&
+         "Expected a funnel shift");
+
+  // If this is (1) a vector funnel shift, (2) shifts by scalars are cheaper
+  // than general vector shifts, and (3) the shift amount is select-of-splatted
+  // values, hoist the funnel shifts before the select:
+  //   fsh Op0, Op1, (select Cond, TVal, FVal) -->
+  //   select Cond, (fsh Op0, Op1, TVal), (fsh Op0, Op1, FVal)
+  //
+  // This is inverting a generic IR transform when we know that the cost of a
+  // general vector shift is more than the cost of 2 shift-by-scalars.
+  // We can't do this effectively in SDAG because we may not be able to
+  // determine if the select operands are splats from within a basic block.
+  Type *Ty = Fsh->getType();
+  if (!Ty->isVectorTy() || !TLI->isVectorShiftByScalarCheap(Ty))
+    return false;
+  Value *Cond, *TVal, *FVal;
+  if (!match(Fsh->getOperand(2),
+             m_OneUse(m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal)))))
+    return false;
+  if (!isSplatValue(TVal) || !isSplatValue(FVal))
+    return false;
+
+  IRBuilder<> Builder(Fsh);
+  Value *X = Fsh->getOperand(0), *Y = Fsh->getOperand(1);
+  Value *NewTVal = Builder.CreateIntrinsic(Opcode, Ty, { X, Y, TVal });
+  Value *NewFVal = Builder.CreateIntrinsic(Opcode, Ty, { X, Y, FVal });
+  Value *NewSel = Builder.CreateSelect(Cond, NewTVal, NewFVal);
+  Fsh->replaceAllUsesWith(NewSel);
+  Fsh->eraseFromParent();
+  return true;
+}
+
  /// If we have a SelectInst that will likely profit from branch prediction,
  /// turn it into a branch.
  bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll

index a1b7d6c..4903104 100644 (file)
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -1695,58 +1695,48 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
  define void @fancierRotate2(i32* %arr, i8* %control, i32 %rot0, i32 %rot1) {
  ; AVX1-LABEL: fancierRotate2:
  ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vmovd %edx, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm9
-; AVX1-NEXT:    vmovd %ecx, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm10
+; AVX1-NEXT:    vmovd %edx, %xmm1
+; AVX1-NEXT:    vmovd %ecx, %xmm3
  ; AVX1-NEXT:    movq $-1024, %rax # imm = 0xFC00
  ; AVX1-NEXT:    vpxor %xmm8, %xmm8, %xmm8
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm11
-; AVX1-NEXT:    vextractf128 $1, %ymm10, %xmm4
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm12 = [31,31,31,31]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [31,31,31,31]
+; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm2
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm9 = xmm2[0],zero,xmm2[1],zero
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
+; AVX1-NEXT:    vpsubd %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm10 = xmm2[0],zero,xmm2[1],zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm4
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm11 = xmm4[0],zero,xmm4[1],zero
+; AVX1-NEXT:    vpsubd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
  ; AVX1-NEXT:    .p2align 4, 0x90
  ; AVX1-NEXT:  .LBB8_1: # %loop
  ; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT:    vmovq {{.*#+}} xmm7 = mem[0],zero
-; AVX1-NEXT:    vpcmpeqb %xmm7, %xmm8, %xmm7
-; AVX1-NEXT:    vpmovsxbd %xmm7, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,3]
-; AVX1-NEXT:    vpmovsxbd %xmm7, %xmm7
-; AVX1-NEXT:    vblendvps %xmm7, %xmm11, %xmm4, %xmm7
-; AVX1-NEXT:    vmovdqu 4096(%rdi,%rax,4), %xmm0
-; AVX1-NEXT:    vmovdqu 4112(%rdi,%rax,4), %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; AVX1-NEXT:    vandps %xmm7, %xmm12, %xmm7
-; AVX1-NEXT:    vpslld $23, %xmm7, %xmm7
-; AVX1-NEXT:    vpaddd %xmm6, %xmm7, %xmm7
-; AVX1-NEXT:    vcvttps2dq %xmm7, %xmm7
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; AVX1-NEXT:    vpmuludq %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpmuludq %xmm7, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,3],xmm5[4,5],xmm3[6,7]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; AVX1-NEXT:    vpor %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX1-NEXT:    vblendvps %xmm2, %xmm9, %xmm10, %xmm2
-; AVX1-NEXT:    vandps %xmm2, %xmm12, %xmm2
-; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddd %xmm6, %xmm2, %xmm2
-; AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; AVX1-NEXT:    vpmuludq %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
-; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqu %xmm0, 4096(%rdi,%rax,4)
-; AVX1-NEXT:    vmovdqu %xmm1, 4112(%rdi,%rax,4)
+; AVX1-NEXT:    vmovq {{.*#+}} xmm5 = mem[0],zero
+; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm8, %xmm5
+; AVX1-NEXT:    vpmovsxbd %xmm5, %xmm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
+; AVX1-NEXT:    vpmovsxbd %xmm5, %xmm5
+; AVX1-NEXT:    vmovdqu 4096(%rdi,%rax,4), %xmm7
+; AVX1-NEXT:    vmovdqu 4112(%rdi,%rax,4), %xmm0
+; AVX1-NEXT:    vpslld %xmm9, %xmm7, %xmm1
+; AVX1-NEXT:    vpsrld %xmm10, %xmm7, %xmm2
+; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpslld %xmm9, %xmm0, %xmm2
+; AVX1-NEXT:    vpsrld %xmm10, %xmm0, %xmm3
+; AVX1-NEXT:    vpor %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpslld %xmm11, %xmm7, %xmm3
+; AVX1-NEXT:    vpsrld %xmm4, %xmm7, %xmm7
+; AVX1-NEXT:    vpor %xmm7, %xmm3, %xmm3
+; AVX1-NEXT:    vblendvps %xmm6, %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpslld %xmm11, %xmm0, %xmm3
+; AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vblendvps %xmm5, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vmovups %xmm1, 4096(%rdi,%rax,4)
+; AVX1-NEXT:    vmovups %xmm0, 4112(%rdi,%rax,4)
  ; AVX1-NEXT:    addq $8, %rax
  ; AVX1-NEXT:    jne .LBB8_1
  ; AVX1-NEXT:  # %bb.2: # %exit
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/vec-shift.ll b/llvm/test/Transforms/CodeGenPrepare/X86/vec-shift.ll

index ba2fef2..b1e7aee 100644 (file)
--- a/llvm/test/Transforms/CodeGenPrepare/X86/vec-shift.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/vec-shift.ll
@@ -265,32 +265,113 @@ exit:
  }
  
  ; PR37426 - https://bugs.llvm.org/show_bug.cgi?id=37426
+; If we don't have real vector shift instructions (AVX1), convert the funnel
+; shift into 2 funnel shifts and sink the splat shuffles into the loop.
  
  define void @fancierRotate2(i32* %arr, i8* %control, i32 %rot0, i32 %rot1) {
-; ALL-LABEL: @fancierRotate2(
-; ALL-NEXT:  entry:
-; ALL-NEXT:    [[I0:%.*]] = insertelement <8 x i32> undef, i32 [[ROT0:%.*]], i32 0
-; ALL-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
-; ALL-NEXT:    [[I1:%.*]] = insertelement <8 x i32> undef, i32 [[ROT1:%.*]], i32 0
-; ALL-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
-; ALL-NEXT:    br label [[LOOP:%.*]]
-; ALL:       loop:
-; ALL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
-; ALL-NEXT:    [[T0:%.*]] = getelementptr inbounds i8, i8* [[CONTROL:%.*]], i64 [[INDEX]]
-; ALL-NEXT:    [[T1:%.*]] = bitcast i8* [[T0]] to <8 x i8>*
-; ALL-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[T1]], align 1
-; ALL-NEXT:    [[T2:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
-; ALL-NEXT:    [[SHAMT:%.*]] = select <8 x i1> [[T2]], <8 x i32> [[S0]], <8 x i32> [[S1]]
-; ALL-NEXT:    [[T4:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
-; ALL-NEXT:    [[T5:%.*]] = bitcast i32* [[T4]] to <8 x i32>*
-; ALL-NEXT:    [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[T5]], align 4
-; ALL-NEXT:    [[ROT:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD21]], <8 x i32> [[WIDE_LOAD21]], <8 x i32> [[SHAMT]])
-; ALL-NEXT:    store <8 x i32> [[ROT]], <8 x i32>* [[T5]], align 4
-; ALL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
-; ALL-NEXT:    [[T7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; ALL-NEXT:    br i1 [[T7]], label [[EXIT:%.*]], label [[LOOP]]
-; ALL:       exit:
-; ALL-NEXT:    ret void
+; AVX1-LABEL: @fancierRotate2(
+; AVX1-NEXT:  entry:
+; AVX1-NEXT:    [[I0:%.*]] = insertelement <8 x i32> undef, i32 [[ROT0:%.*]], i32 0
+; AVX1-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT:    [[I1:%.*]] = insertelement <8 x i32> undef, i32 [[ROT1:%.*]], i32 0
+; AVX1-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT:    br label [[LOOP:%.*]]
+; AVX1:       loop:
+; AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
+; AVX1-NEXT:    [[T0:%.*]] = getelementptr inbounds i8, i8* [[CONTROL:%.*]], i64 [[INDEX]]
+; AVX1-NEXT:    [[T1:%.*]] = bitcast i8* [[T0]] to <8 x i8>*
+; AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[T1]], align 1
+; AVX1-NEXT:    [[T2:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
+; AVX1-NEXT:    [[SHAMT:%.*]] = select <8 x i1> [[T2]], <8 x i32> [[S0]], <8 x i32> [[S1]]
+; AVX1-NEXT:    [[T4:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
+; AVX1-NEXT:    [[T5:%.*]] = bitcast i32* [[T4]] to <8 x i32>*
+; AVX1-NEXT:    [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[T5]], align 4
+; AVX1-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD21]], <8 x i32> [[WIDE_LOAD21]], <8 x i32> [[TMP0]])
+; AVX1-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD21]], <8 x i32> [[WIDE_LOAD21]], <8 x i32> [[TMP2]])
+; AVX1-NEXT:    [[TMP4:%.*]] = select <8 x i1> [[T2]], <8 x i32> [[TMP1]], <8 x i32> [[TMP3]]
+; AVX1-NEXT:    store <8 x i32> [[TMP4]], <8 x i32>* [[T5]], align 4
+; AVX1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; AVX1-NEXT:    [[T7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; AVX1-NEXT:    br i1 [[T7]], label [[EXIT:%.*]], label [[LOOP]]
+; AVX1:       exit:
+; AVX1-NEXT:    ret void
+;
+; AVX2-LABEL: @fancierRotate2(
+; AVX2-NEXT:  entry:
+; AVX2-NEXT:    [[I0:%.*]] = insertelement <8 x i32> undef, i32 [[ROT0:%.*]], i32 0
+; AVX2-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT:    [[I1:%.*]] = insertelement <8 x i32> undef, i32 [[ROT1:%.*]], i32 0
+; AVX2-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT:    br label [[LOOP:%.*]]
+; AVX2:       loop:
+; AVX2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
+; AVX2-NEXT:    [[T0:%.*]] = getelementptr inbounds i8, i8* [[CONTROL:%.*]], i64 [[INDEX]]
+; AVX2-NEXT:    [[T1:%.*]] = bitcast i8* [[T0]] to <8 x i8>*
+; AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[T1]], align 1
+; AVX2-NEXT:    [[T2:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
+; AVX2-NEXT:    [[SHAMT:%.*]] = select <8 x i1> [[T2]], <8 x i32> [[S0]], <8 x i32> [[S1]]
+; AVX2-NEXT:    [[T4:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
+; AVX2-NEXT:    [[T5:%.*]] = bitcast i32* [[T4]] to <8 x i32>*
+; AVX2-NEXT:    [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[T5]], align 4
+; AVX2-NEXT:    [[ROT:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD21]], <8 x i32> [[WIDE_LOAD21]], <8 x i32> [[SHAMT]])
+; AVX2-NEXT:    store <8 x i32> [[ROT]], <8 x i32>* [[T5]], align 4
+; AVX2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; AVX2-NEXT:    [[T7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; AVX2-NEXT:    br i1 [[T7]], label [[EXIT:%.*]], label [[LOOP]]
+; AVX2:       exit:
+; AVX2-NEXT:    ret void
+;
+; AVX512BW-LABEL: @fancierRotate2(
+; AVX512BW-NEXT:  entry:
+; AVX512BW-NEXT:    [[I0:%.*]] = insertelement <8 x i32> undef, i32 [[ROT0:%.*]], i32 0
+; AVX512BW-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512BW-NEXT:    [[I1:%.*]] = insertelement <8 x i32> undef, i32 [[ROT1:%.*]], i32 0
+; AVX512BW-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512BW-NEXT:    br label [[LOOP:%.*]]
+; AVX512BW:       loop:
+; AVX512BW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
+; AVX512BW-NEXT:    [[T0:%.*]] = getelementptr inbounds i8, i8* [[CONTROL:%.*]], i64 [[INDEX]]
+; AVX512BW-NEXT:    [[T1:%.*]] = bitcast i8* [[T0]] to <8 x i8>*
+; AVX512BW-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[T1]], align 1
+; AVX512BW-NEXT:    [[T2:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
+; AVX512BW-NEXT:    [[SHAMT:%.*]] = select <8 x i1> [[T2]], <8 x i32> [[S0]], <8 x i32> [[S1]]
+; AVX512BW-NEXT:    [[T4:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
+; AVX512BW-NEXT:    [[T5:%.*]] = bitcast i32* [[T4]] to <8 x i32>*
+; AVX512BW-NEXT:    [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[T5]], align 4
+; AVX512BW-NEXT:    [[ROT:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD21]], <8 x i32> [[WIDE_LOAD21]], <8 x i32> [[SHAMT]])
+; AVX512BW-NEXT:    store <8 x i32> [[ROT]], <8 x i32>* [[T5]], align 4
+; AVX512BW-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; AVX512BW-NEXT:    [[T7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; AVX512BW-NEXT:    br i1 [[T7]], label [[EXIT:%.*]], label [[LOOP]]
+; AVX512BW:       exit:
+; AVX512BW-NEXT:    ret void
+;
+; XOP-LABEL: @fancierRotate2(
+; XOP-NEXT:  entry:
+; XOP-NEXT:    [[I0:%.*]] = insertelement <8 x i32> undef, i32 [[ROT0:%.*]], i32 0
+; XOP-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
+; XOP-NEXT:    [[I1:%.*]] = insertelement <8 x i32> undef, i32 [[ROT1:%.*]], i32 0
+; XOP-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; XOP-NEXT:    br label [[LOOP:%.*]]
+; XOP:       loop:
+; XOP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
+; XOP-NEXT:    [[T0:%.*]] = getelementptr inbounds i8, i8* [[CONTROL:%.*]], i64 [[INDEX]]
+; XOP-NEXT:    [[T1:%.*]] = bitcast i8* [[T0]] to <8 x i8>*
+; XOP-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[T1]], align 1
+; XOP-NEXT:    [[T2:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
+; XOP-NEXT:    [[SHAMT:%.*]] = select <8 x i1> [[T2]], <8 x i32> [[S0]], <8 x i32> [[S1]]
+; XOP-NEXT:    [[T4:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
+; XOP-NEXT:    [[T5:%.*]] = bitcast i32* [[T4]] to <8 x i32>*
+; XOP-NEXT:    [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[T5]], align 4
+; XOP-NEXT:    [[ROT:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD21]], <8 x i32> [[WIDE_LOAD21]], <8 x i32> [[SHAMT]])
+; XOP-NEXT:    store <8 x i32> [[ROT]], <8 x i32>* [[T5]], align 4
+; XOP-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; XOP-NEXT:    [[T7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; XOP-NEXT:    br i1 [[T7]], label [[EXIT:%.*]], label [[LOOP]]
+; XOP:       exit:
+; XOP-NEXT:    ret void
  ;
  entry:
    %i0 = insertelement <8 x i32> undef, i32 %rot0, i32 0
author	Sanjay Patel <spatel@rotateright.com>
	Fri, 15 May 2020 19:22:30 +0000 (15:22 -0400)
committer	Sanjay Patel <spatel@rotateright.com>
	Sat, 16 May 2020 14:44:47 +0000 (10:44 -0400)
llvm/lib/CodeGen/CodeGenPrepare.cpp		patch \| blob \| history
llvm/test/CodeGen/X86/vector-fshl-256.ll		patch \| blob \| history
llvm/test/Transforms/CodeGenPrepare/X86/vec-shift.ll		patch \| blob \| history