[SROA] Try harder to find a vector promotion viable type when rewriting

author Vang Thao <Vang.Thao@amd.com>

Fri, 10 Jun 2022 19:06:15 +0000 (12:06 -0700)

committer Vang Thao <Vang.Thao@amd.com>

Mon, 8 Aug 2022 18:04:01 +0000 (11:04 -0700)
author Vang Thao <Vang.Thao@amd.com>
Fri, 10 Jun 2022 19:06:15 +0000 (12:06 -0700)
committer Vang Thao <Vang.Thao@amd.com>
Mon, 8 Aug 2022 18:04:01 +0000 (11:04 -0700)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp

index 8df86ce..cb2c43e 100644 (file)
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -1847,6 +1847,34 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
    return true;
  }
  
+/// Test whether a vector type is viable for promotion of partition \p P.
+///
+/// Returns true only if \p VTy has byte-sized elements and every slice and
+/// split slice tail of \p P passes \c isVectorPromotionViableForSlice.
+static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy,
+                                        const DataLayout &DL) {
+  uint64_t ElementSize =
+      DL.getTypeSizeInBits(VTy->getElementType()).getFixedSize();
+
+  // Although LLVM vectors are defined as bit-packed, only element sizes that
+  // are a whole number of bytes are supported here.
+  if (ElementSize % 8)
+    return false;
+  assert((DL.getTypeSizeInBits(VTy).getFixedSize() % 8) == 0 &&
+         "vector size not a multiple of element size?");
+  ElementSize /= 8; // Convert the element size from bits to bytes.
+
+  for (const Slice &S : P)
+    if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
+      return false;
+
+  for (const Slice *S : P.splitSliceTails())
+    if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
+      return false;
+
+  return true;
+}
+
  /// Test whether the given alloca partitioning and range of slices can be
  /// promoted to a vector.
  ///
@@ -1939,31 +1967,8 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
      CandidateTys.resize(1);
    }
  
-  // Try each vector type, and return the one which works.
-  auto CheckVectorTypeForPromotion = [&](VectorType *VTy) {
-    uint64_t ElementSize =
-        DL.getTypeSizeInBits(VTy->getElementType()).getFixedSize();
-
-    // While the definition of LLVM vectors is bitpacked, we don't support sizes
-    // that aren't byte sized.
-    if (ElementSize % 8)
-      return false;
-    assert((DL.getTypeSizeInBits(VTy).getFixedSize() % 8) == 0 &&
-           "vector size not a multiple of element size?");
-    ElementSize /= 8;
-
-    for (const Slice &S : P)
-      if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
-        return false;
-
-    for (const Slice *S : P.splitSliceTails())
-      if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
-        return false;
-
-    return true;
-  };
    for (VectorType *VTy : CandidateTys)
-    if (CheckVectorTypeForPromotion(VTy))
+    if (checkVectorTypeForPromotion(P, VTy, DL))
        return VTy;
  
    return nullptr;
@@ -4246,26 +4251,45 @@ AllocaInst *SROAPass::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
    // won't always succeed, in which case we fall back to a legal integer type
    // or an i8 array of an appropriate size.
    Type *SliceTy = nullptr;
+  VectorType *SliceVecTy = nullptr;
    const DataLayout &DL = AI.getModule()->getDataLayout();
    std::pair<Type *, IntegerType *> CommonUseTy =
        findCommonType(P.begin(), P.end(), P.endOffset());
    // Do all uses operate on the same type?
    if (CommonUseTy.first)
-    if (DL.getTypeAllocSize(CommonUseTy.first).getFixedSize() >= P.size())
+    if (DL.getTypeAllocSize(CommonUseTy.first).getFixedSize() >= P.size()) {
        SliceTy = CommonUseTy.first;
+      SliceVecTy = dyn_cast<VectorType>(SliceTy);
+    }
    // If not, can we find an appropriate subtype in the original allocated type?
    if (!SliceTy)
      if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
                                                   P.beginOffset(), P.size()))
        SliceTy = TypePartitionTy;
+
    // If still not, can we use the largest bitwidth integer type used?
    if (!SliceTy && CommonUseTy.second)
-    if (DL.getTypeAllocSize(CommonUseTy.second).getFixedSize() >= P.size())
+    if (DL.getTypeAllocSize(CommonUseTy.second).getFixedSize() >= P.size()) {
        SliceTy = CommonUseTy.second;
+      SliceVecTy = dyn_cast<VectorType>(SliceTy);
+    }
    if ((!SliceTy || (SliceTy->isArrayTy() &&
                      SliceTy->getArrayElementType()->isIntegerTy())) &&
-      DL.isLegalInteger(P.size() * 8))
+      DL.isLegalInteger(P.size() * 8)) {
      SliceTy = Type::getIntNTy(*C, P.size() * 8);
+  }
+
+  // If the common use types are not viable for promotion then attempt to find
+  // another type that is viable.
+  if (SliceVecTy && !checkVectorTypeForPromotion(P, SliceVecTy, DL))
+    if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
+                                                 P.beginOffset(), P.size())) {
+      VectorType *TypePartitionVecTy = dyn_cast<VectorType>(TypePartitionTy);
+      if (TypePartitionVecTy &&
+          checkVectorTypeForPromotion(P, TypePartitionVecTy, DL))
+        SliceTy = TypePartitionTy;
+    }
+
    if (!SliceTy)
      SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
    assert(DL.getTypeAllocSize(SliceTy).getFixedSize() >= P.size());
diff --git a/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll b/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll

new file mode 100644 (file)

index 0000000..00cbe56
--- /dev/null
+++ b/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll
@@ -0,0 +1,411 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=sroa -S < %s | FileCheck %s
+
+%"struct.a" = type { <8 x half> }
+%"struct.b" = type { %"struct.a" }
+%"struct.c" = type { %"struct.a", i32, i8 }
+%"struct.d" = type { [4 x i32], %"struct.a" }
+%"struct.e" = type { [2 x <8 x half>], i32, i32 }
+%"struct.f" = type { [2 x <8 x i16>], i32, i32 }
+%"array.a" = type [2 x <8 x half>]
+%"array.b" = type [2 x %"struct.a"]
+
+define amdgpu_kernel void @test_zeroinit() #0 {
+; CHECK-LABEL: @test_zeroinit(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half>
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 1
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %b_blockwise_copy = alloca %"struct.b", align 16
+  store <8 x half> zeroinitializer, ptr %b_blockwise_copy, align 16
+  %data = load <4 x float>, <4 x float>* undef
+  store <4 x float> %data, ptr %b_blockwise_copy, align 16
+  br label %bb
+
+bb:
+  %load1 = load half, ptr %b_blockwise_copy, align 16
+  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
+  %load2 = load half, ptr %ptr2, align 16
+  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
+  %load3 = load half, ptr %ptr3, align 16
+  ret void
+}
+
+define amdgpu_kernel void @test_memset() #0 {
+; CHECK-LABEL: @test_memset(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half>
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 1
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %b_blockwise_copy = alloca %"struct.b", align 16
+  call void @llvm.memset.p0.i64(ptr align 16 %b_blockwise_copy, i8 0, i64 16, i1 false)
+  %data = load <4 x float>, <4 x float>* undef
+  store <4 x float> %data, ptr %b_blockwise_copy, align 16
+  br label %bb
+
+bb:
+  %load1 = load half, ptr %b_blockwise_copy, align 16
+  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
+  %load2 = load half, ptr %ptr2, align 16
+  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
+  %load3 = load half, ptr %ptr3, align 16
+  ret void
+}
+
+; The initial SROA pass fails to promote this alloca, and because the same
+; alloca type is re-used the alloca is not re-added to the worklist afterwards.
+; As a result it is not promoted, unlike the allocas in the tests above.
+define amdgpu_kernel void @vector_type_alloca() #0 {
+; CHECK-LABEL: @vector_type_alloca(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY:%.*]] = alloca <8 x half>, align 16
+; CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[B_BLOCKWISE_COPY]], align 16
+; CHECK-NEXT:    [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half>
+; CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[B_BLOCKWISE_COPY]], align 16
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_0_LOAD1:%.*]] = load half, ptr [[B_BLOCKWISE_COPY]], align 16
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_2_PTR2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY]], i64 2
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_2_LOAD2:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_2_PTR2_SROA_IDX]], align 2
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_4_PTR3_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY]], i64 4
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_4_LOAD3:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_4_PTR3_SROA_IDX]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %b_blockwise_copy = alloca <8 x half>, align 16
+  store <8 x half> zeroinitializer, ptr %b_blockwise_copy, align 16
+  %data = load <4 x float>, <4 x float>* undef
+  store <4 x float> %data, ptr %b_blockwise_copy, align 16
+  br label %bb
+
+bb:
+  %load1 = load half, ptr %b_blockwise_copy, align 16
+  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
+  %load2 = load half, ptr %ptr2, align 16
+  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
+  %load3 = load half, ptr %ptr3, align 16
+  ret void
+}
+
+define amdgpu_kernel void @test_struct_contain_multiple_types1() #0 {
+; CHECK-LABEL: @test_struct_contain_multiple_types1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half>
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 1
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %b_blockwise_copy = alloca %"struct.c", align 16
+  store <8 x half> zeroinitializer, ptr %b_blockwise_copy, align 16
+  %data = load <4 x float>, <4 x float>* undef
+  store <4 x float> %data, ptr %b_blockwise_copy, align 16
+  br label %bb
+
+bb:
+  %load1 = load half, ptr %b_blockwise_copy, align 16
+  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
+  %load2 = load half, ptr %ptr2, align 16
+  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
+  %load3 = load half, ptr %ptr3, align 16
+  ret void
+}
+
+define amdgpu_kernel void @test_struct_contain_multiple_types2() #0 {
+; CHECK-LABEL: @test_struct_contain_multiple_types2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DATA1:%.*]] = load [4 x i32], ptr undef, align 4
+; CHECK-NEXT:    [[DATA1_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[DATA1]], 0
+; CHECK-NEXT:    [[DATA1_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i32] [[DATA1]], 1
+; CHECK-NEXT:    [[DATA1_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i32] [[DATA1]], 2
+; CHECK-NEXT:    [[DATA1_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i32] [[DATA1]], 3
+; CHECK-NEXT:    [[DATA2:%.*]] = load <4 x float>, ptr undef, align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA2]] to <8 x half>
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_5_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_5_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 1
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_5_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %b_blockwise_copy = alloca %"struct.d", align 16
+  call void @llvm.memset.p0.i32(ptr align 16 %b_blockwise_copy, i8 0, i32 16, i1 false)
+  %data1 = load [4 x i32], [4 x i32]* undef
+  store [4 x i32] %data1, ptr %b_blockwise_copy, align 16
+  %data2_gep = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 16
+  store <8 x half> zeroinitializer, ptr %data2_gep, align 16
+  %data2 = load <4 x float>, <4 x float>* undef
+  store <4 x float> %data2, ptr %data2_gep, align 16
+  br label %bb
+
+bb:
+  %ptr1 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 16
+  %load1 = load half, ptr %ptr1, align 16
+  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 18
+  %load2 = load half, ptr %ptr2, align 16
+  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 20
+  %load3 = load half, ptr %ptr3, align 16
+  ret void
+}
+
+define amdgpu_kernel void @test_struct_array_vector() #0 {
+; CHECK-LABEL: @test_struct_array_vector(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DATA0:%.*]] = load <4 x float>, ptr undef, align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA0]] to <8 x half>
+; CHECK-NEXT:    [[DATA1:%.*]] = load <4 x float>, ptr undef, align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[DATA1]] to <8 x half>
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_3_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP1]], i32 0
+; CHECK-NEXT:    ret void
+;
+entry:
+  %b_blockwise_copy = alloca %"struct.e", align 16
+  store <8 x half> zeroinitializer, ptr %b_blockwise_copy, align 16
+  %0 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 16
+  store <8 x half> zeroinitializer, ptr %0, align 16
+  %data0 = load <4 x float>, <4 x float>* undef
+  store <4 x float> %data0, ptr %b_blockwise_copy, align 16
+  %data1 = load <4 x float>, <4 x float>* undef
+  store <4 x float> %data1, ptr %0, align 16
+  br label %bb
+
+bb:
+  %load1 = load half, ptr %b_blockwise_copy, align 16
+  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 16
+  %load2 = load half, ptr %ptr2, align 16
+  ret void
+}
+
+define amdgpu_kernel void @test_struct_array_vector_i16() #0 {
+; CHECK-LABEL: @test_struct_array_vector_i16(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DATA:%.*]] = load <4 x i32>, ptr undef, align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[DATA]] to <8 x i16>
+; CHECK-NEXT:    [[DATA2:%.*]] = load <4 x i32>, ptr undef, align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[DATA2]] to <8 x i16>
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0
+; CHECK-NEXT:    ret void
+;
+entry:
+  %b_blockwise_copy = alloca %"struct.f", align 16
+  call void @llvm.memset.p0.i32(ptr align 16 %b_blockwise_copy, i8 0, i32 32, i1 false)
+  %data = load <4 x i32>, <4 x i32>* undef
+  store <4 x i32> %data, ptr %b_blockwise_copy, align 16
+  %data2 = load <4 x i32>, <4 x i32>* undef
+  %data2_gep = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 16
+  store <4 x i32> %data2, ptr %data2_gep, align 16
+  br label %bb
+
+bb:
+  %load1 = load i16, ptr %b_blockwise_copy, align 16
+  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
+  %load2 = load i16, ptr %ptr2, align 16
+  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 16
+  %load3 = load i16, ptr %ptr3, align 16
+  ret void
+}
+
+define amdgpu_kernel void @test_half_array() #0 {
+; CHECK-LABEL: @test_half_array(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0:%.*]] = alloca float, align 16
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4:%.*]] = alloca float, align 4
+; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_0]], i8 0, i32 4, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 4 [[B_BLOCKWISE_COPY_SROA_4]], i8 0, i32 4, i1 false)
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float undef to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float undef to i32
+; CHECK-NEXT:    [[DATA:%.*]] = load [4 x float], ptr undef, align 4
+; CHECK-NEXT:    [[DATA_FCA_0_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 0
+; CHECK-NEXT:    store float [[DATA_FCA_0_EXTRACT]], ptr [[B_BLOCKWISE_COPY_SROA_0]], align 16
+; CHECK-NEXT:    [[DATA_FCA_1_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 1
+; CHECK-NEXT:    store float [[DATA_FCA_1_EXTRACT]], ptr [[B_BLOCKWISE_COPY_SROA_4]], align 4
+; CHECK-NEXT:    [[DATA_FCA_2_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 2
+; CHECK-NEXT:    [[DATA_FCA_3_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 3
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_B_BLOCKWISE_COPY_SROA_0_0_LOAD1:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_0]], align 16
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_PTR2_SROA_IDX1:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY_SROA_0]], i64 2
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_B_BLOCKWISE_COPY_SROA_0_2_LOAD2:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_0_2_PTR2_SROA_IDX1]], align 2
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4_0_B_BLOCKWISE_COPY_SROA_4_4_LOAD3:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_4]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %b_blockwise_copy = alloca [8 x half], align 16
+  call void @llvm.memset.p0.i32(ptr align 16 %b_blockwise_copy, i8 0, i32 16, i1 false)
+  %data = load [4 x float], [4 x float]* undef
+  store [4 x float] %data, ptr %b_blockwise_copy, align 16
+  br label %bb
+
+bb:
+  %load1 = load half, ptr %b_blockwise_copy, align 16
+  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
+  %load2 = load half, ptr %ptr2, align 16
+  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
+  %load3 = load half, ptr %ptr3, align 16
+  ret void
+}
+
+define amdgpu_kernel void @test_array_vector() #0 {
+; CHECK-LABEL: @test_array_vector(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_5:%.*]] = alloca <8 x half>, align 16
+; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_5]], i8 0, i32 16, i1 false)
+; CHECK-NEXT:    [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half>
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 1
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %b_blockwise_copy = alloca %"array.a", align 16
+  call void @llvm.memset.p0.i32(ptr align 16 %b_blockwise_copy, i8 0, i32 32, i1 false)
+  %data = load <4 x float>, <4 x float>* undef
+  store <4 x float> %data, ptr %b_blockwise_copy, align 16
+  br label %bb
+
+bb:
+  %load1 = load half, ptr %b_blockwise_copy, align 16
+  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
+  %load2 = load half, ptr %ptr2, align 16
+  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
+  %load3 = load half, ptr %ptr3, align 16
+  ret void
+}
+
+define amdgpu_kernel void @test_array_vector2() #0 {
+; CHECK-LABEL: @test_array_vector2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_5:%.*]] = alloca <8 x half>, align 16
+; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_5]], i8 0, i32 16, i1 false)
+; CHECK-NEXT:    [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half>
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 1
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %b_blockwise_copy = alloca %"array.b", align 16
+  call void @llvm.memset.p0.i32(ptr align 16 %b_blockwise_copy, i8 0, i32 32, i1 false)
+  %data = load <4 x float>, <4 x float>* undef
+  store <4 x float> %data, ptr %b_blockwise_copy, align 16
+  br label %bb
+
+bb:
+  %load1 = load half, ptr %b_blockwise_copy, align 16
+  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
+  %load2 = load half, ptr %ptr2, align 16
+  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
+  %load3 = load half, ptr %ptr3, align 16
+  ret void
+}
+
+define amdgpu_kernel void @test_array_vector_no_vector_common_type() #0 {
+; CHECK-LABEL: @test_array_vector_no_vector_common_type(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0:%.*]] = alloca float, align 16
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4:%.*]] = alloca float, align 4
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_7:%.*]] = alloca float, align 8
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_10:%.*]] = alloca float, align 4
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_13:%.*]] = alloca <8 x half>, align 16
+; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_0]], i8 0, i32 4, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 4 [[B_BLOCKWISE_COPY_SROA_4]], i8 0, i32 4, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 8 [[B_BLOCKWISE_COPY_SROA_7]], i8 0, i32 4, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 4 [[B_BLOCKWISE_COPY_SROA_10]], i8 0, i32 4, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_13]], i8 0, i32 16, i1 false)
+; CHECK-NEXT:    [[DATA1:%.*]] = load float, ptr undef, align 4
+; CHECK-NEXT:    [[DATA2:%.*]] = load float, ptr undef, align 4
+; CHECK-NEXT:    [[DATA3:%.*]] = load float, ptr undef, align 4
+; CHECK-NEXT:    [[DATA4:%.*]] = load float, ptr undef, align 4
+; CHECK-NEXT:    store float [[DATA1]], ptr [[B_BLOCKWISE_COPY_SROA_0]], align 16
+; CHECK-NEXT:    store float [[DATA2]], ptr [[B_BLOCKWISE_COPY_SROA_4]], align 4
+; CHECK-NEXT:    store float [[DATA3]], ptr [[B_BLOCKWISE_COPY_SROA_7]], align 8
+; CHECK-NEXT:    store float [[DATA4]], ptr [[B_BLOCKWISE_COPY_SROA_10]], align 4
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_B_BLOCKWISE_COPY_SROA_0_0_LOAD1:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_0]], align 16
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_PTR2_SROA_IDX1:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY_SROA_0]], i64 2
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_B_BLOCKWISE_COPY_SROA_0_2_LOAD2:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_0_2_PTR2_SROA_IDX1]], align 2
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4_0_B_BLOCKWISE_COPY_SROA_4_4_LOAD3:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_4]], align 4
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4_2_PTR4_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY_SROA_4]], i64 2
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4_2_B_BLOCKWISE_COPY_SROA_4_6_LOAD4:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_4_2_PTR4_SROA_IDX]], align 2
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_7_0_B_BLOCKWISE_COPY_SROA_7_8_LOAD5:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_7]], align 8
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_7_2_PTR6_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY_SROA_7]], i64 2
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_7_2_B_BLOCKWISE_COPY_SROA_7_10_LOAD6:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_7_2_PTR6_SROA_IDX]], align 2
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_10_0_B_BLOCKWISE_COPY_SROA_10_12_LOAD7:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_10]], align 4
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_10_2_PTR8_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY_SROA_10]], i64 2
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_10_2_B_BLOCKWISE_COPY_SROA_10_14_LOAD8:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_10_2_PTR8_SROA_IDX]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %b_blockwise_copy = alloca %"array.a", align 16
+  call void @llvm.memset.p0.i32(ptr align 16 %b_blockwise_copy, i8 0, i32 32, i1 false)
+  %data1 = load float, float* undef
+  %data2 = load float, float* undef
+  %data3 = load float, float* undef
+  %data4 = load float, float* undef
+  store float %data1, ptr %b_blockwise_copy, align 16
+  %data_ptr1 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
+  store float %data2, ptr %data_ptr1, align 16
+  %data_ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 8
+  store float %data3, ptr %data_ptr2, align 16
+  %data_ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 12
+  store float %data4, ptr %data_ptr3, align 16
+  br label %bb
+
+bb:
+  %load1 = load half, ptr %b_blockwise_copy, align 16
+  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
+  %load2 = load half, ptr %ptr2, align 16
+  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
+  %load3 = load half, ptr %ptr3, align 16
+  %ptr4 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 6
+  %load4 = load half, ptr %ptr4, align 16
+  %ptr5 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 8
+  %load5 = load half, ptr %ptr5, align 16
+  %ptr6 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 10
+  %load6 = load half, ptr %ptr6, align 16
+  %ptr7 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 12
+  %load7 = load half, ptr %ptr7, align 16
+  %ptr8 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 14
+  %load8 = load half, ptr %ptr8, align 16
+  ret void
+}
+
+declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1) nounwind
+declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1) nounwind
+attributes #0 = { nounwind readonly }
+
author	Vang Thao <Vang.Thao@amd.com>
	Fri, 10 Jun 2022 19:06:15 +0000 (12:06 -0700)
committer	Vang Thao <Vang.Thao@amd.com>
	Mon, 8 Aug 2022 18:04:01 +0000 (11:04 -0700)
llvm/lib/Transforms/Scalar/SROA.cpp		patch \| blob \| history
llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll	[new file with mode: 0644]	patch \| blob