From 4f0ed16a46c509a7b8ef09f3c9ae6434d0cf5622 Mon Sep 17 00:00:00 2001 From: Chen Zheng Date: Fri, 9 Dec 2022 00:27:14 -0500 Subject: [PATCH] Reland rGf35a09daebd0a90daa536432e62a2476f708150d and rG63854f91d3ee1056796a5ef27753648396cac6ec [DAGCombiner] handle more store value forwarding When lowering calls on targets like PPC, some stack loads will be generated for by-value parameters. Node CALLSEQ_START prevents such loads from being combined. Suggested by @RolandF, this patch removes the unnecessary loads for the byval parameter by extending ForwardStoreValueToDirectLoad. Reviewed By: nemanjai, RolandF Differential Revision: https://reviews.llvm.org/D138899 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 55 ++++++- llvm/test/CodeGen/AMDGPU/ctpop16.ll | 38 ++--- llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 80 ++++------ llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 172 +++++++++------------ llvm/test/CodeGen/AMDGPU/shl.ll | 42 ++--- llvm/test/CodeGen/AMDGPU/sra.ll | 55 +++---- llvm/test/CodeGen/Mips/o32_cc_byval.ll | 5 +- llvm/test/CodeGen/PowerPC/aix-cc-byval.ll | 16 +- llvm/test/CodeGen/PowerPC/byval-lhs.ll | 8 +- .../CodeGen/PowerPC/ppc64-byval-larger-struct.ll | 6 - llvm/test/CodeGen/X86/fastcc-byval.ll | 3 +- 11 files changed, 221 insertions(+), 259 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 4ab490d..9d54ba3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -363,6 +363,11 @@ namespace { SDValue SplitIndexingFromLoad(LoadSDNode *LD); bool SliceUpLoad(SDNode *N); + // Looks up the chain to find a unique (unaliased) store feeding the passed + // load. If no such store is found, returns a nullptr. + // Note: This will look past a CALLSEQ_START if the load is chained to it so + // that it can find stack stores for byval params. 
+ StoreSDNode *getUniqueStoreFeeding(LoadSDNode *LD, int64_t &Offset); // Scalars have size 0 to distinguish from singleton vectors. SDValue ForwardStoreValueToDirectLoad(LoadSDNode *LD); bool getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val); @@ -17787,11 +17792,53 @@ bool DAGCombiner::extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val) { return false; } +StoreSDNode *DAGCombiner::getUniqueStoreFeeding(LoadSDNode *LD, + int64_t &Offset) { + SDValue Chain = LD->getOperand(0); + + // Look through CALLSEQ_START. + if (Chain.getOpcode() == ISD::CALLSEQ_START) + Chain = Chain->getOperand(0); + + StoreSDNode *ST = nullptr; + SmallVector<SDValue, 8> Aliases; + if (Chain.getOpcode() == ISD::TokenFactor) { + // Look for unique store within the TokenFactor. + for (SDValue Op : Chain->ops()) { + StoreSDNode *Store = dyn_cast<StoreSDNode>(Op.getNode()); + if (!Store) + continue; + BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG); + BaseIndexOffset BasePtrST = BaseIndexOffset::match(Store, DAG); + if (!BasePtrST.equalBaseIndex(BasePtrLD, DAG, Offset)) + continue; + // Make sure the store is not aliased with any nodes in TokenFactor. 
+ GatherAllAliases(Store, Chain, Aliases); + if (Aliases.empty() || + (Aliases.size() == 1 && Aliases.front().getNode() == Store)) + ST = Store; + break; + } + } else { + StoreSDNode *Store = dyn_cast<StoreSDNode>(Chain.getNode()); + if (Store) { + BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG); + BaseIndexOffset BasePtrST = BaseIndexOffset::match(Store, DAG); + if (BasePtrST.equalBaseIndex(BasePtrLD, DAG, Offset)) + ST = Store; + } + } + + return ST; +} + SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) { if (OptLevel == CodeGenOpt::None || !LD->isSimple()) return SDValue(); SDValue Chain = LD->getOperand(0); - StoreSDNode *ST = dyn_cast<StoreSDNode>(Chain.getNode()); + int64_t Offset; + + StoreSDNode *ST = getUniqueStoreFeeding(LD, Offset); // TODO: Relax this restriction for unordered atomics (see D66309) if (!ST || !ST->isSimple() || ST->getAddressSpace() != LD->getAddressSpace()) return SDValue(); @@ -17819,12 +17866,6 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) { if (LdStScalable && DAG.getDataLayout().isBigEndian()) return SDValue(); - BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG); - BaseIndexOffset BasePtrST = BaseIndexOffset::match(ST, DAG); - int64_t Offset; - if (!BasePtrST.equalBaseIndex(BasePtrLD, DAG, Offset)) - return SDValue(); - // Normalize for Endianness. After this Offset=0 will denote that the least // significant bit in the loaded value maps to the least significant bit in // the stored value). 
With Offset=n (for n > 0) the loaded value starts at the diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll index 1b2bca5..0acd24d 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -457,54 +457,50 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr ; ; EG-LABEL: v_ctpop_v4i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 3, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 42, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 37, @12, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XY, T0.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: VTX_READ_64 T8.XY, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: MOV T0.Y, T4.X, +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: MOV T2.X, T0.X, -; EG-NEXT: MOV * T3.X, T0.Y, -; EG-NEXT: MOV T0.X, T4.X, -; EG-NEXT: MOV * T0.Y, PV.X, -; EG-NEXT: AND_INT * T0.W, PV.Y, literal.x, +; EG-NEXT: ALU clause starting at 12: +; EG-NEXT: AND_INT * T0.W, T8.X, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: BCNT_INT T0.W, PV.W, -; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, ; EG-NEXT: -65536(nan), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV T0.X, T3.X, ; EG-NEXT: MOV * T4.X, PV.W, -; EG-NEXT: MOV T0.Z, PS, -; EG-NEXT: LSHR * T0.W, T0.Y, literal.x, +; EG-NEXT: MOV T0.X, PV.X, +; EG-NEXT: LSHR * T0.W, T8.X, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: BCNT_INT T0.W, PV.W, -; EG-NEXT: AND_INT * T1.W, PV.Z, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, ; EG-NEXT: 65535(9.183409e-41), 
0(0.000000e+00) ; EG-NEXT: LSHL * T0.W, PV.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, ; EG-NEXT: MOV T4.X, PV.W, -; EG-NEXT: MOV T0.Y, T5.X, -; EG-NEXT: AND_INT * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: MOV * T0.X, T5.X, +; EG-NEXT: AND_INT * T0.W, T8.Y, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: BCNT_INT T0.W, PV.W, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, ; EG-NEXT: -65536(nan), 0(0.000000e+00) ; EG-NEXT: OR_INT * T0.W, PS, PV.W, ; EG-NEXT: MOV * T5.X, PV.W, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T0.X, literal.x, +; EG-NEXT: MOV T0.X, PV.X, +; EG-NEXT: LSHR * T0.W, T8.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: BCNT_INT T0.W, PV.W, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: LSHL * T0.W, PV.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 089924f..885a800 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -1249,7 +1249,7 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD @@ -1258,17 +1258,13 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T5.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: MOV T2.X, T5.X, -; EG-NEXT: MOV * T3.X, T5.Y, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: MOV * T0.Z, PS, -; EG-NEXT: LSHR * 
T5.W, PV.Z, literal.x, +; EG-NEXT: LSHR * T5.W, T5.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT * T5.Z, T0.Z, literal.x, +; EG-NEXT: AND_INT * T5.Z, T5.Y, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHR * T5.Y, T0.Y, literal.x, +; EG-NEXT: LSHR * T5.Y, T5.X, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T5.X, T0.Y, literal.x, +; EG-NEXT: AND_INT T5.X, T5.X, literal.x, ; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45) %load = load <4 x i16>, ptr addrspace(4) %in @@ -1342,8 +1338,8 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1 +; EG-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: @@ -1351,20 +1347,16 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T5.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: MOV T2.X, T5.X, -; EG-NEXT: MOV * T3.X, T5.Y, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: MOV * T0.Z, PS, -; EG-NEXT: BFE_INT * T5.Z, PV.Z, 0.0, literal.x, +; EG-NEXT: BFE_INT * T6.Z, T5.Y, 0.0, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T5.X, T0.Y, 0.0, literal.x, -; EG-NEXT: LSHR * T0.W, T0.Z, literal.x, +; EG-NEXT: BFE_INT T6.X, T5.X, 0.0, literal.x, +; EG-NEXT: LSHR * T0.W, T5.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T5.W, PV.W, 0.0, literal.x, -; EG-NEXT: LSHR * T0.W, T0.Y, literal.x, +; EG-NEXT: BFE_INT T6.W, PV.W, 0.0, literal.x, +; EG-NEXT: LSHR * T0.W, T5.X, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHR 
T6.X, KC0[2].Y, literal.x, -; EG-NEXT: BFE_INT * T5.Y, PS, 0.0, literal.y, +; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x, +; EG-NEXT: BFE_INT * T6.Y, PS, 0.0, literal.y, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) %load = load <4 x i16>, ptr addrspace(4) %in %ext = sext <4 x i16> %load to <4 x i32> @@ -4879,29 +4871,25 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 18, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T8.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T7.X, 1 +; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T8.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 6: ; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T5.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: MOV T2.X, T5.X, -; EG-NEXT: MOV * T3.X, T5.Y, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: MOV * T0.Z, PS, -; EG-NEXT: LSHR * T5.Z, PV.Z, literal.x, +; EG-NEXT: LSHR * T6.Z, T5.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T5.X, T0.Z, literal.x, -; EG-NEXT: MOV T5.Y, 0.0, -; EG-NEXT: LSHR T6.Z, T0.Y, literal.y, -; EG-NEXT: AND_INT * T6.X, T0.Y, literal.x, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: AND_INT T6.X, T5.Y, literal.x, ; EG-NEXT: MOV T6.Y, 0.0, -; EG-NEXT: MOV T5.W, 0.0, -; EG-NEXT: MOV * T6.W, 0.0, +; EG-NEXT: LSHR T5.Z, T5.X, literal.y, +; EG-NEXT: AND_INT * T5.X, T5.X, literal.x, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: MOV T5.Y, 0.0, +; EG-NEXT: MOV T6.W, 0.0, +; EG-NEXT: MOV * T5.W, 0.0, ; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) @@ -5003,7 +4991,7 @@ define amdgpu_kernel void 
@constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1 ; EG-NEXT: CF_END @@ -5012,21 +5000,17 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T5.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: MOV T2.X, T5.X, -; EG-NEXT: MOV * T3.X, T5.Y, -; EG-NEXT: MOV T0.Y, PS, -; EG-NEXT: MOV * T0.Z, PV.X, -; EG-NEXT: ASHR * T5.W, PV.Z, literal.x, +; EG-NEXT: ASHR * T5.W, T5.X, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) ; EG-NEXT: LSHR T6.X, KC0[2].Y, literal.x, -; EG-NEXT: ASHR T5.Z, T0.Z, literal.y, -; EG-NEXT: ASHR * T7.W, T0.Y, literal.z, +; EG-NEXT: ASHR T5.Z, T5.X, literal.y, +; EG-NEXT: ASHR * T7.W, T5.Y, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T5.X, T0.Z, 0.0, literal.x, -; EG-NEXT: ASHR * T7.Z, T0.Y, literal.x, +; EG-NEXT: BFE_INT T5.X, T5.X, 0.0, literal.x, +; EG-NEXT: ASHR * T7.Z, T5.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T7.X, T0.Y, 0.0, literal.x, +; EG-NEXT: BFE_INT T7.X, T5.Y, 0.0, literal.x, ; EG-NEXT: ASHR T5.Y, PV.X, literal.y, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 89710d9..8a7cdf3 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -1636,7 +1636,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 12, @9, 
KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD @@ -1645,17 +1645,13 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T5.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: MOV T2.X, T5.X, -; EG-NEXT: MOV * T3.X, T5.Y, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: MOV * T0.Z, PS, -; EG-NEXT: LSHR * T5.W, PV.Z, literal.x, +; EG-NEXT: LSHR * T5.W, T5.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT * T5.Z, T0.Z, literal.x, +; EG-NEXT: AND_INT * T5.Z, T5.Y, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHR * T5.Y, T0.Y, literal.x, +; EG-NEXT: LSHR * T5.Y, T5.X, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T5.X, T0.Y, literal.x, +; EG-NEXT: AND_INT T5.X, T5.X, literal.x, ; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45) ; @@ -1663,7 +1659,7 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; CM: ; %bb.0: ; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; CM-NEXT: TEX 0 @6 -; CM-NEXT: ALU 13, @9, KC0[CB0:0-32], KC1[] +; CM-NEXT: ALU 9, @9, KC0[CB0:0-32], KC1[] ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T6.X ; CM-NEXT: CF_END ; CM-NEXT: PAD @@ -1672,17 +1668,13 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; CM-NEXT: ALU clause starting at 8: ; CM-NEXT: MOV * T5.X, KC0[2].Z, ; CM-NEXT: ALU clause starting at 9: -; CM-NEXT: MOV * T2.X, T5.X, -; CM-NEXT: MOV T3.X, T5.Y, -; CM-NEXT: MOV * T0.Y, PV.X, -; CM-NEXT: MOV * T0.Z, PV.X, -; CM-NEXT: LSHR * T5.W, PV.Z, literal.x, +; CM-NEXT: LSHR * T5.W, T5.Y, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: AND_INT * T5.Z, T0.Z, literal.x, +; CM-NEXT: AND_INT * T5.Z, T5.Y, literal.x, ; CM-NEXT: 65535(9.183409e-41), 
0(0.000000e+00) -; CM-NEXT: LSHR * T5.Y, T0.Y, literal.x, +; CM-NEXT: LSHR * T5.Y, T5.X, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: AND_INT * T5.X, T0.Y, literal.x, +; CM-NEXT: AND_INT * T5.X, T5.X, literal.x, ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; CM-NEXT: LSHR * T6.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) @@ -1760,8 +1752,8 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1 +; EG-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: @@ -1769,28 +1761,24 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T5.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: MOV T2.X, T5.X, -; EG-NEXT: MOV * T3.X, T5.Y, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: MOV * T0.Z, PS, -; EG-NEXT: BFE_INT * T5.Z, PV.Z, 0.0, literal.x, +; EG-NEXT: BFE_INT * T6.Z, T5.Y, 0.0, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T5.X, T0.Y, 0.0, literal.x, -; EG-NEXT: LSHR * T0.W, T0.Z, literal.x, +; EG-NEXT: BFE_INT T6.X, T5.X, 0.0, literal.x, +; EG-NEXT: LSHR * T0.W, T5.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T5.W, PV.W, 0.0, literal.x, -; EG-NEXT: LSHR * T0.W, T0.Y, literal.x, +; EG-NEXT: BFE_INT T6.W, PV.W, 0.0, literal.x, +; EG-NEXT: LSHR * T0.W, T5.X, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T6.X, KC0[2].Y, literal.x, -; EG-NEXT: BFE_INT * T5.Y, PS, 0.0, literal.y, +; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x, +; EG-NEXT: BFE_INT * T6.Y, PS, 0.0, literal.y, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; ; CM-LABEL: 
global_sextload_v4i16_to_v4i32: ; CM: ; %bb.0: ; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; CM-NEXT: TEX 0 @6 -; CM-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T6.X +; CM-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6, T5.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: Fetch clause starting at 6: @@ -1798,20 +1786,16 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; CM-NEXT: ALU clause starting at 8: ; CM-NEXT: MOV * T5.X, KC0[2].Z, ; CM-NEXT: ALU clause starting at 9: -; CM-NEXT: MOV * T2.X, T5.X, -; CM-NEXT: MOV T3.X, T5.Y, -; CM-NEXT: MOV * T0.Y, PV.X, -; CM-NEXT: MOV * T0.Z, PV.X, -; CM-NEXT: BFE_INT * T5.Z, PV.Z, 0.0, literal.x, +; CM-NEXT: BFE_INT * T6.Z, T5.Y, 0.0, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T5.X, T0.Y, 0.0, literal.x, -; CM-NEXT: LSHR * T0.W, T0.Z, literal.x, +; CM-NEXT: BFE_INT T6.X, T5.X, 0.0, literal.x, +; CM-NEXT: LSHR * T0.W, T5.Y, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: LSHR T0.Z, T0.Y, literal.x, -; CM-NEXT: BFE_INT * T5.W, PV.W, 0.0, literal.x, +; CM-NEXT: LSHR T0.Z, T5.X, literal.x, +; CM-NEXT: BFE_INT * T6.W, PV.W, 0.0, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: LSHR T6.X, KC0[2].Y, literal.x, -; CM-NEXT: BFE_INT * T5.Y, PV.Z, 0.0, literal.y, +; CM-NEXT: LSHR T5.X, KC0[2].Y, literal.x, +; CM-NEXT: BFE_INT * T6.Y, PV.Z, 0.0, literal.y, ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) %load = load <4 x i16>, ptr addrspace(1) %in %ext = sext <4 x i16> %load to <4 x i32> @@ -5788,29 +5772,25 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 18, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T8.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T7.X, 1 +; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], 
KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T8.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 6: ; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T5.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: MOV T2.X, T5.X, -; EG-NEXT: MOV * T3.X, T5.Y, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: MOV * T0.Z, PS, -; EG-NEXT: LSHR * T5.Z, PV.Z, literal.x, +; EG-NEXT: LSHR * T6.Z, T5.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T5.X, T0.Z, literal.x, -; EG-NEXT: MOV T5.Y, 0.0, -; EG-NEXT: LSHR T6.Z, T0.Y, literal.y, -; EG-NEXT: AND_INT * T6.X, T0.Y, literal.x, -; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: AND_INT T6.X, T5.Y, literal.x, ; EG-NEXT: MOV T6.Y, 0.0, -; EG-NEXT: MOV T5.W, 0.0, -; EG-NEXT: MOV * T6.W, 0.0, +; EG-NEXT: LSHR T5.Z, T5.X, literal.y, +; EG-NEXT: AND_INT * T5.X, T5.X, literal.x, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: MOV T5.Y, 0.0, +; EG-NEXT: MOV T6.W, 0.0, +; EG-NEXT: MOV * T5.W, 0.0, ; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) @@ -5821,30 +5801,26 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; CM: ; %bb.0: ; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; CM-NEXT: TEX 0 @6 -; CM-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T8.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6, T7.X +; CM-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6, T8.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T7.X ; CM-NEXT: CF_END ; CM-NEXT: Fetch clause starting at 6: ; CM-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1 ; CM-NEXT: ALU clause starting at 8: ; CM-NEXT: MOV * T5.X, KC0[2].Z, ; CM-NEXT: ALU clause starting at 9: -; CM-NEXT: MOV * T2.X, T5.X, -; CM-NEXT: MOV * T3.X, T5.Y, -; 
CM-NEXT: MOV T0.Y, PV.X, -; CM-NEXT: MOV * T0.Z, T2.X, -; CM-NEXT: LSHR * T5.Z, PV.Z, literal.x, +; CM-NEXT: LSHR * T6.Z, T5.X, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: AND_INT T5.X, T0.Z, literal.x, -; CM-NEXT: MOV T5.Y, 0.0, -; CM-NEXT: LSHR * T6.Z, T0.Y, literal.y, -; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T6.X, T0.Y, literal.x, +; CM-NEXT: AND_INT T6.X, T5.X, literal.x, ; CM-NEXT: MOV T6.Y, 0.0, -; CM-NEXT: MOV * T5.W, 0.0, -; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; CM-NEXT: LSHR * T5.Z, T5.Y, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: AND_INT T5.X, T5.Y, literal.x, +; CM-NEXT: MOV T5.Y, 0.0, ; CM-NEXT: MOV * T6.W, 0.0, +; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; CM-NEXT: MOV * T5.W, 0.0, ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; CM-NEXT: LSHR * T7.X, PV.W, literal.x, @@ -5945,7 +5921,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1 ; EG-NEXT: CF_END @@ -5954,21 +5930,17 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T5.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: MOV T2.X, T5.X, -; EG-NEXT: MOV * T3.X, T5.Y, -; EG-NEXT: MOV T0.Y, PS, -; EG-NEXT: MOV * T0.Z, PV.X, -; EG-NEXT: ASHR * T5.W, PV.Z, literal.x, +; EG-NEXT: ASHR * T5.W, T5.X, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) ; EG-NEXT: LSHR T6.X, KC0[2].Y, literal.x, -; EG-NEXT: ASHR T5.Z, T0.Z, literal.y, -; EG-NEXT: ASHR * T7.W, T0.Y, literal.z, +; EG-NEXT: ASHR T5.Z, T5.X, literal.y, +; EG-NEXT: ASHR * T7.W, T5.Y, 
literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T5.X, T0.Z, 0.0, literal.x, -; EG-NEXT: ASHR * T7.Z, T0.Y, literal.x, +; EG-NEXT: BFE_INT T5.X, T5.X, 0.0, literal.x, +; EG-NEXT: ASHR * T7.Z, T5.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T7.X, T0.Y, 0.0, literal.x, +; EG-NEXT: BFE_INT T7.X, T5.Y, 0.0, literal.x, ; EG-NEXT: ASHR T5.Y, PV.X, literal.y, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) @@ -5980,35 +5952,31 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; CM: ; %bb.0: ; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; CM-NEXT: TEX 0 @6 -; CM-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T8.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T6.X +; CM-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T8.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6, T7.X ; CM-NEXT: CF_END ; CM-NEXT: Fetch clause starting at 6: ; CM-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1 ; CM-NEXT: ALU clause starting at 8: ; CM-NEXT: MOV * T5.X, KC0[2].Z, ; CM-NEXT: ALU clause starting at 9: -; CM-NEXT: MOV * T2.X, T5.X, -; CM-NEXT: MOV T3.X, T5.Y, -; CM-NEXT: MOV * T0.Y, PV.X, -; CM-NEXT: MOV * T0.Z, PV.X, -; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x, -; CM-NEXT: ASHR * T5.W, PV.Z, literal.y, +; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.x, +; CM-NEXT: ASHR * T6.W, T5.Y, literal.y, ; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44) -; CM-NEXT: LSHR T6.X, PV.Z, literal.x, -; CM-NEXT: ASHR T5.Z, T0.Z, literal.y, -; CM-NEXT: ASHR * T7.W, T0.Y, literal.z, +; CM-NEXT: LSHR T7.X, PV.Z, literal.x, +; CM-NEXT: ASHR T6.Z, T5.Y, literal.y, +; CM-NEXT: ASHR * T5.W, T5.X, literal.z, ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; CM-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T5.X, T0.Z, 0.0, literal.x, -; CM-NEXT: ASHR * T7.Z, T0.Y, 
literal.x, +; CM-NEXT: BFE_INT T6.X, T5.Y, 0.0, literal.x, +; CM-NEXT: ASHR * T5.Z, T5.X, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T7.X, T0.Y, 0.0, literal.x, -; CM-NEXT: ASHR * T5.Y, PV.X, literal.y, +; CM-NEXT: BFE_INT T5.X, T5.X, 0.0, literal.x, +; CM-NEXT: ASHR * T6.Y, PV.X, literal.y, ; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44) ; CM-NEXT: LSHR T8.X, KC0[2].Y, literal.x, -; CM-NEXT: ASHR * T7.Y, PV.X, literal.y, +; CM-NEXT: ASHR * T5.Y, PV.X, literal.y, ; CM-NEXT: 2(2.802597e-45), 31(4.344025e-44) %load = load <4 x i16>, ptr addrspace(1) %in %ext = sext <4 x i16> %load to <4 x i64> diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll index 8f99ab7..20c9544 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -681,51 +681,43 @@ define amdgpu_kernel void @shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; EG-LABEL: shl_v4i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 3, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 51, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 42, @12, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XY, T0.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: ; EG-NEXT: VTX_READ_128 T10.XYZW, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: MOV T0.Y, T6.X, +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: MOV T4.X, T10.X, -; EG-NEXT: MOV * T5.X, T10.Y, -; EG-NEXT: MOV T0.X, PV.X, -; EG-NEXT: MOV T0.Y, PS, -; EG-NEXT: MOV * T2.X, T10.Z, -; EG-NEXT: MOV T3.X, T10.W, -; EG-NEXT: MOV * T0.Z, T6.X, -; EG-NEXT: MOV * T1.Y, T2.X, -; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x, +; EG-NEXT: ALU clause starting at 12: +; EG-NEXT: AND_INT * T1.W, T10.Z, literal.x, ; EG-NEXT: 
65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHL * T1.W, T0.X, PV.W, +; EG-NEXT: LSHL * T1.W, T10.X, PV.W, ; EG-NEXT: AND_INT T1.W, PV.W, literal.x, -; EG-NEXT: AND_INT * T2.W, T0.Z, literal.y, +; EG-NEXT: AND_INT * T2.W, T0.Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), -65536(nan) ; EG-NEXT: OR_INT * T1.W, PS, PV.W, -; EG-NEXT: MOV * T0.Z, T3.X, -; EG-NEXT: MOV * T6.X, T1.W, -; EG-NEXT: MOV T1.Z, PV.X, -; EG-NEXT: LSHR T1.W, T1.Y, literal.x, -; EG-NEXT: LSHR * T2.W, T0.X, literal.x, +; EG-NEXT: MOV * T6.X, PV.W, +; EG-NEXT: MOV T0.X, PV.X, +; EG-NEXT: LSHR T1.W, T10.Z, literal.x, +; EG-NEXT: LSHR * T2.W, T10.X, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LSHL T1.W, PS, PV.W, -; EG-NEXT: AND_INT * T2.W, PV.Z, literal.x, +; EG-NEXT: AND_INT * T2.W, PV.X, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: LSHL * T1.W, PV.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, ; EG-NEXT: MOV T6.X, PV.W, ; EG-NEXT: MOV * T0.X, T7.X, -; EG-NEXT: AND_INT * T1.W, T0.Z, literal.x, +; EG-NEXT: AND_INT * T1.W, T10.W, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHL T1.W, T0.Y, PV.W, +; EG-NEXT: LSHL T1.W, T10.Y, PV.W, ; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, ; EG-NEXT: -65536(nan), 0(0.000000e+00) ; EG-NEXT: AND_INT * T1.W, PV.W, literal.x, @@ -733,8 +725,8 @@ define amdgpu_kernel void @shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, ; EG-NEXT: MOV * T7.X, PV.W, ; EG-NEXT: MOV T0.X, PV.X, -; EG-NEXT: LSHR T1.W, T0.Z, literal.x, -; EG-NEXT: LSHR * T2.W, T0.Y, literal.x, +; EG-NEXT: LSHR T1.W, T10.W, literal.x, +; EG-NEXT: LSHR * T2.W, T10.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LSHL * T1.W, PS, PV.W, ; EG-NEXT: AND_INT T0.Z, T0.X, literal.x, diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll index cac9179..7097f58 100644 --- a/llvm/test/CodeGen/AMDGPU/sra.ll 
+++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -323,52 +323,43 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; EG-LABEL: ashr_v4i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 58, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 48, @10, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XY, T9.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: ; EG-NEXT: VTX_READ_128 T9.XYZW, T9.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.Y, T6.X, ; EG-NEXT: MOV * T9.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: MOV T4.X, T9.X, -; EG-NEXT: MOV * T5.X, T9.Y, -; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: MOV * T0.Z, PS, -; EG-NEXT: MOV T2.X, T9.Z, -; EG-NEXT: MOV * T3.X, T9.W, -; EG-NEXT: MOV * T0.W, T6.X, -; EG-NEXT: MOV T1.Y, T2.X, -; EG-NEXT: BFE_INT * T1.W, T0.Y, 0.0, literal.x, -; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, -; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: ASHR * T1.W, T1.W, PV.W, -; EG-NEXT: AND_INT T1.W, PV.W, literal.x, -; EG-NEXT: AND_INT * T0.W, T0.W, literal.y, +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: BFE_INT T0.W, T9.X, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, T9.Z, literal.y, +; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) +; EG-NEXT: ASHR * T0.W, PV.W, PS, +; EG-NEXT: AND_INT T0.W, PV.W, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), -65536(nan) ; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: MOV * T1.Z, T3.X, -; EG-NEXT: MOV * T6.X, T0.W, -; EG-NEXT: MOV T0.W, PV.X, -; EG-NEXT: LSHR * T1.W, T0.Y, literal.x, +; EG-NEXT: MOV * T6.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T9.X, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T1.W, PS, 0.0, literal.x, -; EG-NEXT: LSHR * T2.W, T1.Y, literal.x, +; EG-NEXT: 
BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: LSHR * T1.W, T9.Z, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: ASHR T1.W, PV.W, PS, -; EG-NEXT: AND_INT * T0.W, T0.W, literal.x, +; EG-NEXT: ASHR T0.W, PV.W, PS, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHL * T1.W, PV.W, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: OR_INT * T0.W, T0.W, PV.W, +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, ; EG-NEXT: MOV T6.X, PV.W, ; EG-NEXT: MOV T0.Y, T7.X, -; EG-NEXT: BFE_INT T0.W, T0.Z, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, T1.Z, literal.y, +; EG-NEXT: BFE_INT T0.W, T9.Y, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, T9.W, literal.y, ; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) ; EG-NEXT: ASHR T0.W, PV.W, PS, ; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x, @@ -378,10 +369,10 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, ; EG-NEXT: MOV * T7.X, PV.W, ; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T0.W, T0.Z, literal.x, +; EG-NEXT: LSHR * T0.W, T9.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, -; EG-NEXT: LSHR * T1.W, T1.Z, literal.x, +; EG-NEXT: LSHR * T1.W, T9.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: ASHR T0.W, PV.W, PS, ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, diff --git a/llvm/test/CodeGen/Mips/o32_cc_byval.ll b/llvm/test/CodeGen/Mips/o32_cc_byval.ll index 94c327b..de6b4dd 100644 --- a/llvm/test/CodeGen/Mips/o32_cc_byval.ll +++ b/llvm/test/CodeGen/Mips/o32_cc_byval.ll @@ -69,8 +69,8 @@ define void @f1() nounwind { ; CHECK-NEXT: sw $1, 16($sp) ; CHECK-NEXT: lw $7, 4($18) ; CHECK-NEXT: lw $6, %lo(f1.s1)($17) -; CHECK-NEXT: lbu $5, 40($sp) ; CHECK-NEXT: lw $25, %call16(callee3)($16) +; CHECK-NEXT: addiu $5, $zero, 11 ; CHECK-NEXT: jalr $25 ; CHECK-NEXT: move $gp, $16 ; 
CHECK-NEXT: lw $16, 48($sp) # 4-byte Folded Reload @@ -234,6 +234,7 @@ define void @f5(i64 %a0, ptr nocapture byval(%struct.S4) %a1) nounwind { ; CHECK-NEXT: addiu $sp, $sp, -32 ; CHECK-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill ; CHECK-NEXT: addu $gp, $2, $25 +; CHECK-NEXT: move $1, $6 ; CHECK-NEXT: sw $7, 44($sp) ; CHECK-NEXT: sw $6, 40($sp) ; CHECK-NEXT: sw $5, 20($sp) @@ -243,7 +244,7 @@ define void @f5(i64 %a0, ptr nocapture byval(%struct.S4) %a1) nounwind { ; CHECK-NEXT: lw $5, 44($sp) ; CHECK-NEXT: lw $25, %call16(f6)($gp) ; CHECK-NEXT: jalr $25 -; CHECK-NEXT: lw $4, 40($sp) +; CHECK-NEXT: move $4, $1 ; CHECK-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload ; CHECK-NEXT: jr $ra ; CHECK-NEXT: addiu $sp, $sp, 32 diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll b/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll index 508dd63..5e7a1bc 100644 --- a/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll +++ b/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll @@ -958,33 +958,33 @@ declare i32 @test_byval_homogeneous_float_struct(ptr byval(%struct.F) align 4) ; CHECK-LABEL: name: call_test_byval_homogeneous_float_struct{{.*}} ; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 -; 32BIT-DAG: renamable $r3 = LWZ 0, %stack.0.s :: (load (s32) from %stack.0.s, align 8) ; 32BIT-DAG: renamable $r4 = LWZ 4, %stack.0.s :: (load (s32) from %stack.0.s + 4) ; 32BIT-DAG: renamable $r5 = LWZ 8, %stack.0.s :: (load (s32) from %stack.0.s + 8, align 8) +; 32BIT-DAG: $r3 = LI 0 ; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r2, implicit-def $r1, implicit-def $r3 ; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 ; CHECKASM-LABEL: .call_test_byval_homogeneous_float_struct: ; ASM32: stwu 1, -80(1) -; ASM32-DAG: lwz 3, 64(1) ; ASM32-DAG: lwz 4, 68(1) ; ASM32-DAG: lwz 5, 72(1) +; ASM32-DAG: stw 3, 64(1) ; ASM32-NEXT: bl .test_byval_homogeneous_float_struct[PR] ; ASM32-NEXT: nop ; The DAG block 
permits some invalid inputs for the benefit of allowing more valid orderings. ; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT-DAG: renamable $x3 = LD 0, %stack.0.s :: (load (s64) from %stack.0.s) -; 64BIT-DAG: renamable $x4 = LWZ8 8, %stack.0.s :: (load (s32) from %stack.0.s + 8, align 8) -; 64BIT-DAG: renamable $x4 = RLDICR killed renamable $x4, 32, 31 +; 64BIT: renamable $x3 = LWZ8 8, %stack.0.s :: (load (s32) from %stack.0.s + 8, align 8) +; 64BIT-NEXT: renamable $x4 = RLDICR killed renamable $x3, 32, 31 +; 64BIT-NEXT: $x3 = LI8 0 ; 64BIT-NEXT: BL8_NOP , csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x2, implicit-def $r1, implicit-def $x3 ; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 ; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. ; ASM64: stdu 1, -128(1) -; ASM64-DAG: ld 3, 112(1) -; ASM64-DAG: lwz 4, 120(1) -; ASM64-DAG: sldi 4, 4, 32 +; ASM64: lwz 3, 120(1) +; ASM64-NEXT: sldi 4, 3, 32 +; ASM64-NEXT: li 3, 0 ; ASM64-NEXT: bl .test_byval_homogeneous_float_struct[PR] ; ASM64-NEXT: nop diff --git a/llvm/test/CodeGen/PowerPC/byval-lhs.ll b/llvm/test/CodeGen/PowerPC/byval-lhs.ll index 80de181..aef374a 100644 --- a/llvm/test/CodeGen/PowerPC/byval-lhs.ll +++ b/llvm/test/CodeGen/PowerPC/byval-lhs.ll @@ -17,7 +17,6 @@ define void @bar1(i64 %a) nounwind { ; LE-NEXT: stdu r1, -48(r1) ; LE-NEXT: std r0, 64(r1) ; LE-NEXT: std r3, 40(r1) -; LE-NEXT: ld r3, 40(r1) ; LE-NEXT: bl f0 ; LE-NEXT: nop ; LE-NEXT: addi r1, r1, 48 @@ -31,7 +30,6 @@ define void @bar1(i64 %a) nounwind { ; AIX-NEXT: stdu r1, -128(r1) ; AIX-NEXT: std r0, 144(r1) ; AIX-NEXT: std r3, 120(r1) -; AIX-NEXT: ld r3, 120(r1) ; AIX-NEXT: bl .f0[PR] ; AIX-NEXT: nop ; AIX-NEXT: addi r1, r1, 128 @@ -49,11 +47,10 @@ define void @bar2(i64 %a) nounwind { ; LE: # %bb.0: ; LE-NEXT: mflr r0 ; LE-NEXT: stdu r1, -48(r1) +; LE-NEXT: mr r4, r3 ; LE-NEXT: std r0, 64(r1) ; LE-NEXT: std 
r3, 32(r1) ; LE-NEXT: std r3, 40(r1) -; LE-NEXT: ld r4, 40(r1) -; LE-NEXT: ld r3, 32(r1) ; LE-NEXT: bl f1 ; LE-NEXT: nop ; LE-NEXT: addi r1, r1, 48 @@ -65,11 +62,10 @@ define void @bar2(i64 %a) nounwind { ; AIX: # %bb.0: ; AIX-NEXT: mflr r0 ; AIX-NEXT: stdu r1, -128(r1) +; AIX-NEXT: mr r4, r3 ; AIX-NEXT: std r0, 144(r1) ; AIX-NEXT: std r3, 112(r1) ; AIX-NEXT: std r3, 120(r1) -; AIX-NEXT: ld r4, 120(r1) -; AIX-NEXT: ld r3, 112(r1) ; AIX-NEXT: bl .f1[PR] ; AIX-NEXT: nop ; AIX-NEXT: addi r1, r1, 128 diff --git a/llvm/test/CodeGen/PowerPC/ppc64-byval-larger-struct.ll b/llvm/test/CodeGen/PowerPC/ppc64-byval-larger-struct.ll index 429b877..39b0661 100644 --- a/llvm/test/CodeGen/PowerPC/ppc64-byval-larger-struct.ll +++ b/llvm/test/CodeGen/PowerPC/ppc64-byval-larger-struct.ll @@ -184,7 +184,6 @@ define signext i8 @caller_9_callee_9(ptr nocapture readonly byval([9 x i8]) %dat ; P8LE-NEXT: stdx r3, 0, r5 ; P8LE-NEXT: stb r4, 79(r1) ; P8LE-NEXT: lbz r4, 56(r1) -; P8LE-NEXT: ld r3, 48(r1) ; P8LE-NEXT: bl callee_9 ; P8LE-NEXT: nop ; P8LE-NEXT: li r3, 0 @@ -204,7 +203,6 @@ define signext i8 @caller_9_callee_9(ptr nocapture readonly byval([9 x i8]) %dat ; P9LE-NEXT: std r3, 48(r1) ; P9LE-NEXT: stdx r3, 0, r4 ; P9LE-NEXT: lbz r4, 56(r1) -; P9LE-NEXT: ld r3, 48(r1) ; P9LE-NEXT: stb r5, 79(r1) ; P9LE-NEXT: bl callee_9 ; P9LE-NEXT: nop @@ -225,7 +223,6 @@ define signext i8 @caller_9_callee_9(ptr nocapture readonly byval([9 x i8]) %dat ; P10LE-NEXT: lbz r5, 56(r1) ; P10LE-NEXT: stdx r3, 0, r4 ; P10LE-NEXT: lbz r4, 56(r1) -; P10LE-NEXT: ld r3, 48(r1) ; P10LE-NEXT: stb r5, 79(r1) ; P10LE-NEXT: bl callee_9@notoc ; P10LE-NEXT: li r3, 0 @@ -246,7 +243,6 @@ define signext i8 @caller_9_callee_9(ptr nocapture readonly byval([9 x i8]) %dat ; P8BE-NEXT: stdx r3, 0, r5 ; P8BE-NEXT: stb r4, 143(r1) ; P8BE-NEXT: lbz r4, 200(r1) -; P8BE-NEXT: ld r3, 192(r1) ; P8BE-NEXT: bl callee_9 ; P8BE-NEXT: nop ; P8BE-NEXT: li r3, 0 @@ -266,7 +262,6 @@ define signext i8 @caller_9_callee_9(ptr nocapture readonly 
byval([9 x i8]) %dat ; P9BE-NEXT: std r3, 192(r1) ; P9BE-NEXT: stdx r3, 0, r4 ; P9BE-NEXT: lbz r4, 200(r1) -; P9BE-NEXT: ld r3, 192(r1) ; P9BE-NEXT: stb r5, 143(r1) ; P9BE-NEXT: bl callee_9 ; P9BE-NEXT: nop @@ -287,7 +282,6 @@ define signext i8 @caller_9_callee_9(ptr nocapture readonly byval([9 x i8]) %dat ; P10BE-NEXT: lbz r5, 200(r1) ; P10BE-NEXT: stdx r3, 0, r4 ; P10BE-NEXT: lbz r4, 200(r1) -; P10BE-NEXT: ld r3, 192(r1) ; P10BE-NEXT: stb r5, 143(r1) ; P10BE-NEXT: bl callee_9 ; P10BE-NEXT: nop diff --git a/llvm/test/CodeGen/X86/fastcc-byval.ll b/llvm/test/CodeGen/X86/fastcc-byval.ll index aee07ca..920291a 100644 --- a/llvm/test/CodeGen/X86/fastcc-byval.ll +++ b/llvm/test/CodeGen/X86/fastcc-byval.ll @@ -16,8 +16,7 @@ define fastcc i32 @bar() nounwind { ; CHECK: ## %bb.0: ; CHECK-NEXT: subl $12, %esp ; CHECK-NEXT: movl $1, 8(%esp) -; CHECK-NEXT: movl 8(%esp), %eax -; CHECK-NEXT: movl %eax, (%esp) +; CHECK-NEXT: movl $1, (%esp) ; CHECK-NEXT: calll _foo ; CHECK-NEXT: movl 8(%esp), %eax ; CHECK-NEXT: addl $12, %esp -- 2.7.4