From: Mateja Marjanovic
Date: Mon, 15 May 2023 16:20:50 +0000 (+0200)
Subject: [AMDGPU] Trim zero components from buffer and image stores
X-Git-Tag: upstream/17.0.6~8502
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=3181a6e3e7dae9292782216a55c5e1f0583c1668;p=platform%2Fupstream%2Fllvm.git

[AMDGPU] Trim zero components from buffer and image stores

For image and buffer stores, the default behaviour on GFX11 and older
is to set all unset components to zero. Passing only the X component is
therefore the same as passing X000, and XY the same as XY00. This patch
simplifies the stored vector of components in InstCombine by removing
zero components from its end. For image stores it also trims the DMask
when necessary.

Reviewed By: foad, arsenm

Differential Revision: https://reviews.llvm.org/D146737
---
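For illustration, a minimal before/after sketch of the combine, mirroring
the image_store_1d_store_insert_zeros_at_end test added below (@example,
%x, %s and %rsrc are placeholder names, not part of the commit):

define amdgpu_ps void @example(<8 x i32> inreg %rsrc, float %x, i32 %s) {
  ; Before the combine: all four components are written, dmask = 15 (0b1111).
  %v1 = insertelement <4 x float> undef, float %x, i32 0
  %v2 = insertelement <4 x float> %v1, float 0.0, i32 1
  %v3 = insertelement <4 x float> %v2, float 0.0, i32 2
  %v4 = insertelement <4 x float> %v3, float 0.0, i32 3
  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %v4, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
  ; After the combine: only X is stored and dmask shrinks to 1; on GFX11 and
  ; older the hardware zero-fills the components the dmask leaves unset.
  call void @llvm.amdgcn.image.store.1d.f32.i32(float %x, i32 1, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
  ret void
}
declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32)
declare void @llvm.amdgcn.image.store.1d.f32.i32(float, i32, i32, <8 x i32>, i32, i32)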
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 48960ee..6bc751a 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -872,10 +872,12 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
   defm int_amdgcn_image_store : AMDGPUImageDimIntrinsicsAll<
               "STORE", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
-              [IntrWriteMem, IntrWillReturn], [SDNPMemOperand]>;
+              [IntrWriteMem, IntrWillReturn], [SDNPMemOperand]>,
+              AMDGPUImageDMaskIntrinsic;
   defm int_amdgcn_image_store_mip : AMDGPUImageDimIntrinsicsNoMsaa<
               "STORE_MIP", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
-              [IntrWriteMem, IntrWillReturn], [SDNPMemOperand], 1>;
+              [IntrWriteMem, IntrWillReturn], [SDNPMemOperand], 1>,
+              AMDGPUImageDMaskIntrinsic;
 
 //////////////////////////////////////////////////////////////////////////
 // MSAA intrinsics
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 48a6fde..9b9cec65 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -355,6 +355,36 @@ bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
   return false;
 }
 
+// Trim all zero components from the end of the vector \p UseV and return
+// an appropriate bitset with known elements.
+static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
+                                       Instruction *I) {
+  auto *VTy = cast<FixedVectorType>(UseV->getType());
+  unsigned VWidth = VTy->getNumElements();
+  APInt DemandedElts = APInt::getAllOnes(VWidth);
+
+  for (int i = VWidth - 1; i >= 0; --i) {
+    APInt DemandOneElt = APInt::getOneBitSet(VWidth, i);
+    KnownFPClass KnownFPClass =
+        computeKnownFPClass(UseV, DemandOneElt, IC.getDataLayout(),
+                            /*InterestedClasses=*/fcAllFlags,
+                            /*Depth=*/0, &IC.getTargetLibraryInfo(),
+                            &IC.getAssumptionCache(), I,
+                            &IC.getDominatorTree(),
+                            &IC.getOptimizationRemarkEmitter());
+    if (KnownFPClass.KnownFPClasses != fcPosZero)
+      break;
+    DemandedElts.clearBit(i);
+  }
+  return DemandedElts;
+}
+
+static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
+                                                    IntrinsicInst &II,
+                                                    APInt DemandedElts,
+                                                    int DMaskIdx = -1,
+                                                    bool IsLoad = true);
+
 std::optional<Instruction *>
 GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
   Intrinsic::ID IID = II.getIntrinsicID();
@@ -1054,26 +1084,65 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
       return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
     break;
   }
-  default: {
-    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
-            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
-      return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
+  case Intrinsic::amdgcn_buffer_store:
+  case Intrinsic::amdgcn_buffer_store_format:
+  case Intrinsic::amdgcn_raw_buffer_store:
+  case Intrinsic::amdgcn_raw_buffer_store_format:
+  case Intrinsic::amdgcn_raw_tbuffer_store:
+  case Intrinsic::amdgcn_struct_buffer_store:
+  case Intrinsic::amdgcn_struct_buffer_store_format:
+  case Intrinsic::amdgcn_struct_tbuffer_store:
+  case Intrinsic::amdgcn_tbuffer_store:
+  case Intrinsic::amdgcn_image_store_1d:
+  case Intrinsic::amdgcn_image_store_1darray:
+  case Intrinsic::amdgcn_image_store_2d:
+  case Intrinsic::amdgcn_image_store_2darray:
+  case Intrinsic::amdgcn_image_store_2darraymsaa:
+  case Intrinsic::amdgcn_image_store_2dmsaa:
+  case Intrinsic::amdgcn_image_store_3d:
+  case Intrinsic::amdgcn_image_store_cube:
+  case Intrinsic::amdgcn_image_store_mip_1d:
+  case Intrinsic::amdgcn_image_store_mip_1darray:
+  case Intrinsic::amdgcn_image_store_mip_2d:
+  case Intrinsic::amdgcn_image_store_mip_2darray:
+  case Intrinsic::amdgcn_image_store_mip_3d:
+  case Intrinsic::amdgcn_image_store_mip_cube: {
+    if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
+      break;
+
+    APInt DemandedElts =
+        trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
+
+    int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
+    if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
+                                              false)) {
+      return IC.eraseInstFromFunction(II);
     }
+
+    break;
+  }
   }
+  if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
+          AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
+    return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
   }
   return std::nullopt;
 }
 
 /// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
 ///
+/// Simplifying amdgcn image and buffer store intrinsics updates the definition
+/// of the intrinsic's vector argument, rather than the uses of the result as
+/// is done for image and buffer loads.
 /// Note: This only supports non-TFE/LWE image intrinsic calls; those have
 /// struct returns.
 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                     IntrinsicInst &II,
                                                     APInt DemandedElts,
-                                                    int DMaskIdx = -1) {
+                                                    int DMaskIdx, bool IsLoad) {
 
-  auto *IIVTy = cast<FixedVectorType>(II.getType());
+  auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType()
+                                             : II.getOperand(0)->getType());
   unsigned VWidth = IIVTy->getNumElements();
   if (VWidth == 1)
     return nullptr;
@@ -1144,13 +1213,13 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
     DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;
 
     unsigned NewDMaskVal = 0;
-    unsigned OrigLoadIdx = 0;
+    unsigned OrigLdStIdx = 0;
     for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
       const unsigned Bit = 1 << SrcIdx;
       if (!!(DMaskVal & Bit)) {
-        if (!!DemandedElts[OrigLoadIdx])
+        if (!!DemandedElts[OrigLdStIdx])
           NewDMaskVal |= Bit;
-        OrigLoadIdx++;
+        OrigLdStIdx++;
       }
     }
@@ -1178,29 +1247,45 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
       (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
   OverloadTys[0] = NewTy;
 
+  if (!IsLoad) {
+    SmallVector<int> EltMask;
+    for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
+      if (DemandedElts[OrigStoreIdx])
+        EltMask.push_back(OrigStoreIdx);
+
+    if (NewNumElts == 1)
+      Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]);
+    else
+      Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
+  }
+
   Function *NewIntrin = Intrinsic::getDeclaration(
       II.getModule(), II.getIntrinsicID(), OverloadTys);
   CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
   NewCall->takeName(&II);
   NewCall->copyMetadata(II);
 
-  if (NewNumElts == 1) {
-    return IC.Builder.CreateInsertElement(UndefValue::get(IIVTy), NewCall,
-                                          DemandedElts.countr_zero());
-  }
+  if (IsLoad) {
+    if (NewNumElts == 1) {
+      return IC.Builder.CreateInsertElement(UndefValue::get(IIVTy), NewCall,
+                                            DemandedElts.countr_zero());
+    }
 
-  SmallVector<int> EltMask;
-  unsigned NewLoadIdx = 0;
-  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
-    if (!!DemandedElts[OrigLoadIdx])
-      EltMask.push_back(NewLoadIdx++);
-    else
-      EltMask.push_back(NewNumElts);
-  }
+    SmallVector<int> EltMask;
+    unsigned NewLoadIdx = 0;
+    for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
+      if (!!DemandedElts[OrigLoadIdx])
+        EltMask.push_back(NewLoadIdx++);
+      else
+        EltMask.push_back(NewNumElts);
+    }
+
+    auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
 
-  Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
+    return Shuffle;
+  }
 
-  return Shuffle;
+  return NewCall;
 }
 
 std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
index 34bd96c..ae7d541 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -66,7 +66,7 @@ define double @test_constant_fold_rcp_f64_43() nounwind {
 
 define float @test_constant_fold_rcp_f32_43_strictfp() nounwind strictfp {
 ; CHECK-LABEL: @test_constant_fold_rcp_f32_43_strictfp(
-; CHECK-NEXT:    [[VAL:%.*]] = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) #[[ATTR14:[0-9]+]]
+; CHECK-NEXT:    [[VAL:%.*]] = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) #[[ATTR13:[0-9]+]]
 ; CHECK-NEXT:    ret float [[VAL]]
 ;
   %val = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) strictfp nounwind readnone
@@ -107,7 +107,7 @@ define double @test_constant_fold_sqrt_f64_undef() nounwind {
 
 define half @test_constant_fold_sqrt_f16_0() nounwind {
 ; CHECK-LABEL: @test_constant_fold_sqrt_f16_0(
-; CHECK-NEXT:    [[VAL:%.*]] = call half @llvm.amdgcn.sqrt.f16(half 0xH0000) #[[ATTR15:[0-9]+]]
+; CHECK-NEXT:    [[VAL:%.*]] = call half @llvm.amdgcn.sqrt.f16(half 0xH0000) #[[ATTR14:[0-9]+]]
 ; CHECK-NEXT:    ret half [[VAL]]
 ;
   %val = call half @llvm.amdgcn.sqrt.f16(half 0.0) nounwind readnone
@@ -116,7 +116,7 @@ define half @test_constant_fold_sqrt_f16_0() nounwind {
 
 define float @test_constant_fold_sqrt_f32_0() nounwind {
 ; CHECK-LABEL: @test_constant_fold_sqrt_f32_0(
-; CHECK-NEXT:    [[VAL:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 0.000000e+00) #[[ATTR15]]
+; CHECK-NEXT:    [[VAL:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 0.000000e+00) #[[ATTR14]]
 ; CHECK-NEXT:    ret float [[VAL]]
 ;
   %val = call float @llvm.amdgcn.sqrt.f32(float 0.0) nounwind readnone
@@ -125,7 +125,7 @@ define float @test_constant_fold_sqrt_f32_0() nounwind {
 
 define double @test_constant_fold_sqrt_f64_0() nounwind {
 ; CHECK-LABEL: @test_constant_fold_sqrt_f64_0(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.sqrt.f64(double 0.000000e+00) #[[ATTR15]]
+; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.sqrt.f64(double 0.000000e+00) #[[ATTR14]]
 ; CHECK-NEXT:    ret double [[VAL]]
 ;
   %val = call double @llvm.amdgcn.sqrt.f64(double 0.0) nounwind readnone
@@ -134,7 +134,7 @@ define double @test_constant_fold_sqrt_f64_0() nounwind {
 
 define half @test_constant_fold_sqrt_f16_neg0() nounwind {
 ; CHECK-LABEL: @test_constant_fold_sqrt_f16_neg0(
-; CHECK-NEXT:    [[VAL:%.*]] = call half @llvm.amdgcn.sqrt.f16(half 0xH8000) #[[ATTR15]]
+; CHECK-NEXT:    [[VAL:%.*]] = call half @llvm.amdgcn.sqrt.f16(half 0xH8000) #[[ATTR14]]
 ; CHECK-NEXT:    ret half [[VAL]]
 ;
   %val = call half @llvm.amdgcn.sqrt.f16(half -0.0) nounwind readnone
@@ -143,7 +143,7 @@ define half @test_constant_fold_sqrt_f16_neg0() nounwind {
 
 define float @test_constant_fold_sqrt_f32_neg0() nounwind {
 ; CHECK-LABEL: @test_constant_fold_sqrt_f32_neg0(
-; CHECK-NEXT:    [[VAL:%.*]] = call float @llvm.amdgcn.sqrt.f32(float -0.000000e+00) #[[ATTR15]]
+; CHECK-NEXT:    [[VAL:%.*]] = call float @llvm.amdgcn.sqrt.f32(float -0.000000e+00) #[[ATTR14]]
 ; CHECK-NEXT:    ret float [[VAL]]
 ;
   %val = call float @llvm.amdgcn.sqrt.f32(float -0.0) nounwind readnone
@@ -152,7 +152,7 @@ define float @test_constant_fold_sqrt_f32_neg0() nounwind {
 
 define double @test_constant_fold_sqrt_f64_neg0() nounwind {
 ; CHECK-LABEL: @test_constant_fold_sqrt_f64_neg0(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.sqrt.f64(double -0.000000e+00) #[[ATTR15]]
+; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.sqrt.f64(double -0.000000e+00) #[[ATTR14]]
 ; CHECK-NEXT:    ret double [[VAL]]
 ;
   %val = call double @llvm.amdgcn.sqrt.f64(double -0.0) nounwind readnone
@@ -644,7 +644,7 @@ define i1 @test_class_isnan_f32(float %x) nounwind {
 
 define i1 @test_class_isnan_f32_strict(float %x) nounwind {
 ; CHECK-LABEL: @test_class_isnan_f32_strict(
-; CHECK-NEXT:    [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X:%.*]], i32 3) #[[ATTR16:[0-9]+]]
+; CHECK-NEXT:    [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X:%.*]], i32 3) #[[ATTR15:[0-9]+]]
 ; CHECK-NEXT:    ret i1 [[VAL]]
 ;
   %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 3) strictfp
@@ -662,7 +662,7 @@ define i1 @test_class_is_p0_n0_f32(float %x) nounwind {
 
 define i1 @test_class_is_p0_n0_f32_strict(float %x) nounwind {
 ; CHECK-LABEL: @test_class_is_p0_n0_f32_strict(
-; CHECK-NEXT:    [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X:%.*]], i32 96) #[[ATTR16]]
+; CHECK-NEXT:    [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X:%.*]], i32 96) #[[ATTR15]]
 ; CHECK-NEXT:    ret i1 [[VAL]]
 ;
   %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 96) strictfp
@@ -1275,8 +1275,8 @@ define i32 @ubfe_offset_0_width_0(i32 %src) {
 
 define i32 @ubfe_offset_0_width_3(i32 %src) {
 ; CHECK-LABEL: @ubfe_offset_0_width_3(
-; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[SRC:%.*]], 7
-; CHECK-NEXT:    ret i32 [[TMP1]]
+; CHECK-NEXT:    [[BFE:%.*]] = and i32 [[SRC:%.*]], 7
+; CHECK-NEXT:    ret i32 [[BFE]]
 ;
   %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 0, i32 3)
   ret i32 %bfe
@@ -1793,7 +1793,7 @@ define i64 @icmp_constant_inputs_false() {
 
 define i64 @icmp_constant_inputs_true() {
 ; CHECK-LABEL: @icmp_constant_inputs_true(
-; CHECK-NEXT:    [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0:![0-9]+]]) #[[ATTR17:[0-9]+]]
+; CHECK-NEXT:    [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0:![0-9]+]]) #[[ATTR16:[0-9]+]]
 ; CHECK-NEXT:    ret i64 [[RESULT]]
 ;
   %result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 9, i32 8, i32 34)
@@ -2500,7 +2500,7 @@ define i64 @fcmp_constant_inputs_false() {
 
 define i64 @fcmp_constant_inputs_true() {
 ; CHECK-LABEL: @fcmp_constant_inputs_true(
-; CHECK-NEXT:    [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0]]) #[[ATTR17]]
+; CHECK-NEXT:    [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0]]) #[[ATTR16]]
 ; CHECK-NEXT:    ret i64 [[RESULT]]
 ;
   %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float 2.0, float 4.0, i32 4)
@@ -2542,7 +2542,7 @@ define i64 @ballot_zero_64() {
 
 define i64 @ballot_one_64() {
 ; CHECK-LABEL: @ballot_one_64(
-; CHECK-NEXT:    [[B:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0]]) #[[ATTR17]]
+; CHECK-NEXT:    [[B:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0]]) #[[ATTR16]]
 ; CHECK-NEXT:    ret i64 [[B]]
 ;
   %b = call i64 @llvm.amdgcn.ballot.i64(i1 1)
@@ -2568,7 +2568,7 @@ define i32 @ballot_zero_32() {
 
 define i32 @ballot_one_32() {
 ; CHECK-LABEL: @ballot_one_32(
-; CHECK-NEXT:    [[B:%.*]] = call i32 @llvm.read_register.i32(metadata [[META1:![0-9]+]]) #[[ATTR17]]
+; CHECK-NEXT:    [[B:%.*]] = call i32 @llvm.read_register.i32(metadata [[META1:![0-9]+]]) #[[ATTR16]]
 ; CHECK-NEXT:    ret i32 [[B]]
 ;
   %b = call i32 @llvm.amdgcn.ballot.i32(i1 1)
@@ -5586,7 +5586,7 @@ define double @trig_preop_constfold() {
 
 define double @trig_preop_constfold_strictfp() {
 ; CHECK-LABEL: @trig_preop_constfold_strictfp(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 5) #[[ATTR16]]
+; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 5) #[[ATTR15]]
 ; CHECK-NEXT:    ret double [[VAL]]
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 5) strictfp
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
new file mode 100644
index 0000000..7f88108
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mcpu=gfx900 -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck -check-prefixes=GCN %s
+; RUN: opt -mcpu=gfx1010 -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck -check-prefixes=GCN %s
+; RUN: opt -mcpu=gfx1100 -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck -check-prefixes=GCN %s
+
+define amdgpu_ps void @image_store_1d_store_insert_zeros_at_end(<8 x i32> inreg %rsrc, float %vdata1, i32 %s) #0 {
+; GCN-LABEL: @image_store_1d_store_insert_zeros_at_end(
+; GCN-NEXT:    call void @llvm.amdgcn.image.store.1d.f32.i32(float [[VDATA1:%.*]], i32 1, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; GCN-NEXT:    ret void
+;
+  %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
+  %newvdata2 = insertelement <4 x float> %newvdata1, float 0.0, i32 1
+  %newvdata3 = insertelement <4 x float> %newvdata2, float 0.0, i32 2
+  %newvdata4 = insertelement <4 x float> %newvdata3, float 0.0, i32 3
+  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %newvdata4, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @image_store_mip_1d_store_insert_zeros_at_end(<8 x i32> inreg %rsrc, float %vdata1, float %vdata2, i32 %s, i32 %mip) #0 {
+; GCN-LABEL: @image_store_mip_1d_store_insert_zeros_at_end(
+; GCN-NEXT:    [[TMP1:%.*]] = insertelement <3 x float> <float 0.000000e+00, float undef, float undef>, float [[VDATA1:%.*]], i64 1
+; GCN-NEXT:    [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[VDATA2:%.*]], i64 2
+; GCN-NEXT:    call void @llvm.amdgcn.image.store.1d.v3f32.i32(<3 x float> [[TMP2]], i32 7, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; GCN-NEXT:    ret void
+;
+  %newvdata1 = insertelement <4 x float> undef, float 0.0, i32 0
+  %newvdata2 = insertelement <4 x float> %newvdata1, float %vdata1, i32 1
+  %newvdata3 = insertelement <4 x float> %newvdata2, float %vdata2, i32 2
+  %newvdata4 = insertelement <4 x float> %newvdata3, float 0.0, i32 3
+  call void @llvm.amdgcn.image.store.mip.1d.v4f32.i32(<4 x float> %newvdata4, i32 7, i32 %s, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_store_insert_zeros_at_end(<4 x i32> inreg %a, float %vdata1, i32 %b) {
+; GCN-LABEL: @buffer_store_insert_zeros_at_end(
+; GCN-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> undef, float [[VDATA1:%.*]], i64 0
+; GCN-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
+; GCN-NEXT:    call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> [[TMP2]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i1 false, i1 false)
+; GCN-NEXT:    ret void
+;
+  %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
+  %newvdata2 = insertelement <4 x float> %newvdata1, float %vdata1, i32 1
+  %newvdata3 = insertelement <4 x float> %newvdata2, float 0.0, i32 2
+  %newvdata4 = insertelement <4 x float> %newvdata3, float 0.0, i32 3
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %newvdata4, <4 x i32> %a, i32 %b, i32 0, i1 0, i1 0)
+  ret void
+}
+
+define amdgpu_ps void @struct_buffer_store_insert_zeros(<4 x i32> inreg %a, float %vdata1, i32 %b) {
+; GCN-LABEL: @struct_buffer_store_insert_zeros(
+; GCN-NEXT:    [[TMP1:%.*]] = insertelement <3 x float> <float undef, float 0.000000e+00, float undef>, float [[VDATA1:%.*]], i64 0
+; GCN-NEXT:    [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[VDATA1]], i64 2
+; GCN-NEXT:    call void @llvm.amdgcn.struct.buffer.store.v3f32(<3 x float> [[TMP2]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0)
+; GCN-NEXT:    ret void
+;
+  %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
+  %newvdata2 = insertelement <4 x float> %newvdata1, float 0.0, i32 1
+  %newvdata3 = insertelement <4 x float> %newvdata2, float %vdata1, i32 2
+  %newvdata4 = insertelement <4 x float> %newvdata3, float 0.0, i32 3
+  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %newvdata4, <4 x i32> %a, i32 %b, i32 0, i32 42, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @struct_tbuffer_store_insert_zeros_at_beginning(<4 x i32> inreg %a, float %vdata1, i32 %b) {
+; GCN-LABEL: @struct_tbuffer_store_insert_zeros_at_beginning(
+; GCN-NEXT:    [[NEWVDATA4:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef>, float [[VDATA1:%.*]], i64 3
+; GCN-NEXT:    call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15)
+; GCN-NEXT:    ret void
+;
+  %newvdata1 = insertelement <4 x float> undef, float 0.0, i32 0
+  %newvdata2 = insertelement <4 x float> %newvdata1, float 0.0, i32 1
+  %newvdata3 = insertelement <4 x float> %newvdata2, float 0.0, i32 2
+  %newvdata4 = insertelement <4 x float> %newvdata3, float %vdata1, i32 3
+  call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %newvdata4, <4 x i32> %a, i32 %b, i32 0, i32 42, i32 0, i32 15)
+  ret void
+}
+
+declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #2
+declare void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #2
+declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1
+declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #2
+declare void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #2
+declare void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32) #0
+declare void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #0
+declare void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.cube.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.1darray.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2darray.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2dmsaa.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2darraymsaa.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.1d.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.2d.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.3d.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.cube.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.1darray.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.2darray.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind writeonly }
+attributes #2 = { nounwind }
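A caveat that follows from the fcPosZero check in trimTrailingZerosInVector:
a trailing component is only dropped when computeKnownFPClass proves it is
positive zero, because the hardware zero-fill writes +0.0. A trailing -0.0
(or any value not provably +0.0) blocks the combine. An illustrative sketch,
not one of the tests above (the function name is a placeholder):

define amdgpu_ps void @no_trim_for_negative_zero(<4 x i32> inreg %a, float %x, i32 %b) {
  ; -0.0 is not fcPosZero, so this store keeps both components.
  %v1 = insertelement <2 x float> undef, float %x, i32 0
  %v2 = insertelement <2 x float> %v1, float -0.0, i32 1
  call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %a, i32 %b, i32 0, i1 0, i1 0)
  ret void
}
declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1)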