[InstCombine] allow bitcast to/from FP for vector insert/extract transform

author Sanjay Patel <spatel@rotateright.com>

Thu, 4 Oct 2018 16:25:05 +0000 (16:25 +0000)

committer Sanjay Patel <spatel@rotateright.com>

Thu, 4 Oct 2018 16:25:05 +0000 (16:25 +0000)
author Sanjay Patel <spatel@rotateright.com>
Thu, 4 Oct 2018 16:25:05 +0000 (16:25 +0000)
committer Sanjay Patel <spatel@rotateright.com>
Thu, 4 Oct 2018 16:25:05 +0000 (16:25 +0000)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp

index 945664d..f01f2b0 100644 (file)
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -189,9 +189,7 @@ static Instruction *foldBitcastExtElt(ExtractElementInst &Ext,
  
    // If the source elements are wider than the destination, try to shift and
    // truncate a subset of scalar bits of an insert op.
-  // TODO: This is limited to integer types, but we could bitcast to/from FP.
-  if (NumSrcElts < NumElts && SrcTy->getScalarType()->isIntegerTy() &&
-      DestTy->getScalarType()->isIntegerTy()) {
+  if (NumSrcElts < NumElts) {
      Value *Scalar;
      uint64_t InsIndexC;
      if (!match(X, m_InsertElement(m_Value(), m_Value(Scalar),
@@ -220,13 +218,42 @@ static Instruction *foldBitcastExtElt(ExtractElementInst &Ext,
      unsigned Chunk = ExtIndexC % NarrowingRatio;
      if (IsBigEndian)
        Chunk = NarrowingRatio - 1 - Chunk;
-    unsigned ShAmt = Chunk * DestTy->getPrimitiveSizeInBits();
+
+    // Bail out if this is an FP vector to FP vector sequence. That would take
+    // more instructions than we started with unless there is no shift, and it
+    // may not be handled as well in the backend.
+    bool NeedSrcBitcast = SrcTy->getScalarType()->isFloatingPointTy();
+    bool NeedDestBitcast = DestTy->isFloatingPointTy();
+    if (NeedSrcBitcast && NeedDestBitcast)
+      return nullptr;
+
+    unsigned SrcWidth = SrcTy->getScalarSizeInBits();
+    unsigned DestWidth = DestTy->getPrimitiveSizeInBits();
+    unsigned ShAmt = Chunk * DestWidth;
+
+    // TODO: This limitation is more strict than necessary. We could sum the
+    // number of new instructions and subtract the number eliminated to know if
+    // we can proceed.
+    if (!X->hasOneUse() || !Ext.getVectorOperand()->hasOneUse())
+      if (NeedSrcBitcast || NeedDestBitcast)
+        return nullptr;
+
+    if (NeedSrcBitcast) {
+      Type *SrcIntTy = IntegerType::getIntNTy(Scalar->getContext(), SrcWidth);
+      Scalar = Builder.CreateBitCast(Scalar, SrcIntTy);
+    }
+
      if (ShAmt) {
        // Bail out if we could end with more instructions than we started with.
        if (!Ext.getVectorOperand()->hasOneUse())
          return nullptr;
        Scalar = Builder.CreateLShr(Scalar, ShAmt);
      }
+
+    if (NeedDestBitcast) {
+      Type *DestIntTy = IntegerType::getIntNTy(Scalar->getContext(), DestWidth);
+      return new BitCastInst(Builder.CreateTrunc(Scalar, DestIntTy), DestTy);
+    }
      return new TruncInst(Scalar, DestTy);
    }
  
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll

index 2ca3e32..af34a3f 100644 (file)
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
@@ -299,9 +299,8 @@ define i32 @extract0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 {
  
  ; CHECK-LABEL: @extract_lo16_0_bitcast_buffer_load_format_v4f32(
  ; CHECK-NEXT: %tmp = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false)
-; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %tmp, i64 0
-; CHECK-NEXT: %tmp1 = bitcast <4 x float> %1 to <8 x i16>
-; CHECK-NEXT: %tmp2 = extractelement <8 x i16> %tmp1, i32 0
+; CHECK-NEXT: %1 = bitcast float %tmp to i32
+; CHECK-NEXT: %tmp2 = trunc i32 %1 to i16
  ; CHECK-NEXT: ret i16 %tmp2
  define i16 @extract_lo16_0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 {
    %tmp = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
diff --git a/llvm/test/Transforms/InstCombine/extractelement.ll b/llvm/test/Transforms/InstCombine/extractelement.ll

index ae91396..5d6a3a1 100644 (file)
--- a/llvm/test/Transforms/InstCombine/extractelement.ll
+++ b/llvm/test/Transforms/InstCombine/extractelement.ll
@@ -164,11 +164,16 @@ define i8 @bitcasted_inselt_wide_source_uses(i32 %x) {
  }
  
  define float @bitcasted_inselt_to_FP(i64 %x) {
-; ANY-LABEL: @bitcasted_inselt_to_FP(
-; ANY-NEXT:    [[I:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0
-; ANY-NEXT:    [[B:%.*]] = bitcast <2 x i64> [[I]] to <4 x float>
-; ANY-NEXT:    [[R:%.*]] = extractelement <4 x float> [[B]], i32 1
-; ANY-NEXT:    ret float [[R]]
+; LE-LABEL: @bitcasted_inselt_to_FP(
+; LE-NEXT:    [[TMP1:%.*]] = lshr i64 [[X:%.*]], 32
+; LE-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
+; LE-NEXT:    [[R:%.*]] = bitcast i32 [[TMP2]] to float
+; LE-NEXT:    ret float [[R]]
+;
+; BE-LABEL: @bitcasted_inselt_to_FP(
+; BE-NEXT:    [[TMP1:%.*]] = trunc i64 [[X:%.*]] to i32
+; BE-NEXT:    [[R:%.*]] = bitcast i32 [[TMP1]] to float
+; BE-NEXT:    ret float [[R]]
  ;
    %i = insertelement <2 x i64> undef, i64 %x, i32 0
    %b = bitcast <2 x i64> %i to <4 x float>
@@ -210,11 +215,16 @@ define float @bitcasted_inselt_to_FP_uses2(i128 %x) {
  }
  
  define i32 @bitcasted_inselt_from_FP(double %x) {
-; ANY-LABEL: @bitcasted_inselt_from_FP(
-; ANY-NEXT:    [[I:%.*]] = insertelement <2 x double> undef, double [[X:%.*]], i32 0
-; ANY-NEXT:    [[B:%.*]] = bitcast <2 x double> [[I]] to <4 x i32>
-; ANY-NEXT:    [[R:%.*]] = extractelement <4 x i32> [[B]], i32 1
-; ANY-NEXT:    ret i32 [[R]]
+; LE-LABEL: @bitcasted_inselt_from_FP(
+; LE-NEXT:    [[TMP1:%.*]] = bitcast double [[X:%.*]] to i64
+; LE-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 32
+; LE-NEXT:    [[R:%.*]] = trunc i64 [[TMP2]] to i32
+; LE-NEXT:    ret i32 [[R]]
+;
+; BE-LABEL: @bitcasted_inselt_from_FP(
+; BE-NEXT:    [[TMP1:%.*]] = bitcast double [[X:%.*]] to i64
+; BE-NEXT:    [[R:%.*]] = trunc i64 [[TMP1]] to i32
+; BE-NEXT:    ret i32 [[R]]
  ;
    %i = insertelement <2 x double> undef, double %x, i32 0
    %b = bitcast <2 x double> %i to <4 x i32>
author	Sanjay Patel <spatel@rotateright.com>
	Thu, 4 Oct 2018 16:25:05 +0000 (16:25 +0000)
committer	Sanjay Patel <spatel@rotateright.com>
	Thu, 4 Oct 2018 16:25:05 +0000 (16:25 +0000)
llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp		patch | blob | history
llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll		patch | blob | history
llvm/test/Transforms/InstCombine/extractelement.ll		patch | blob | history