From b36d462fac8a236b328b00abf053774ff7cd376a Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Tue, 1 Mar 2016 21:31:53 +0000
Subject: [PATCH] DAGCombiner: Turn truncate of a bitcasted vector to an extract

On AMDGPU, where i64 operations are often bitcasted to v2i32 and back,
this pattern shows up regularly and breaks some expected combines on
i64, such as load width reduction.

This fixes some test failures in a future commit when i64 loads are
changed to promote.

llvm-svn: 262397
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp    | 16 ++++
 llvm/test/CodeGen/AMDGPU/half.ll                 |  5 +-
 llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll | 93 ++++++++++++++++++++++++
 3 files changed, 111 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4120e79..cde38e4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7176,6 +7176,22 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     }
   }
 
+  // Fold truncate of a bitcast of a vector to an extract of the low vector
+  // element.
+  //
+  // e.g. trunc (i64 (bitcast v2i32:x)) -> extract_vector_elt v2i32:x, 0
+  if (N0.getOpcode() == ISD::BITCAST && !VT.isVector()) {
+    SDValue VecSrc = N0.getOperand(0);
+    EVT SrcVT = VecSrc.getValueType();
+    if (SrcVT.isVector() && SrcVT.getScalarType() == VT) {
+      SDLoc SL(N);
+
+      EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, VT,
+                         VecSrc, DAG.getConstant(0, SL, IdxVT));
+    }
+  }
+
   // Simplify the operands using demanded-bits information.
   if (!VT.isVector() &&
       SimplifyDemandedBits(SDValue(N, 0)))
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index 76fbc6a..409b2ec 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -396,12 +396,11 @@ define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x
 
 ; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64:
 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
-; GCN: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
-
 ; GCN: v_cvt_f32_f16_e32
 ; GCN: v_cvt_f32_f16_e32
+; GCN-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
 ; GCN: v_cvt_f32_f16_e32
-; GCN-NOT: v_cvt_f32_f16_e32
+; GCN-NOT: v_cvt_f32_f16
 
 ; GCN: v_cvt_f64_f32_e32
 ; GCN: v_cvt_f64_f32_e32
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll b/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
new file mode 100644
index 0000000..20c49e2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
@@ -0,0 +1,93 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: {{^}}trunc_i64_bitcast_v2i32:
+; CHECK: buffer_load_dword v
+; CHECK: buffer_store_dword v
+define void @trunc_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+  %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
+  %bc = bitcast <2 x i32> %ld to i64
+  %trunc = trunc i64 %bc to i32
+  store i32 %trunc, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}trunc_i96_bitcast_v3i32:
+; CHECK: buffer_load_dword v
+; CHECK: buffer_store_dword v
+define void @trunc_i96_bitcast_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %in) {
+  %ld = load <3 x i32>, <3 x i32> addrspace(1)* %in
+  %bc = bitcast <3 x i32> %ld to i96
+  %trunc = trunc i96 %bc to i32
+  store i32 %trunc, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}trunc_i128_bitcast_v4i32:
+; CHECK: buffer_load_dword v
+; CHECK: buffer_store_dword v
+define void @trunc_i128_bitcast_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+  %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
+  %bc = bitcast <4 x i32> %ld to i128
+  %trunc = trunc i128 %bc to i32
+  store i32 %trunc, i32 addrspace(1)* %out
+  ret void
+}
+
+; Don't want load width reduced in this case.
+; CHECK-LABEL: {{^}}trunc_i16_bitcast_v2i16:
+; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
+; CHECK: buffer_store_short [[VAL]]
+define void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+  %ld = load <2 x i16>, <2 x i16> addrspace(1)* %in
+  %bc = bitcast <2 x i16> %ld to i32
+  %trunc = trunc i32 %bc to i16
+  store i16 %trunc, i16 addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Don't want load width reduced here.
+; CHECK-LABEL: {{^}}trunc_i16_bitcast_v4i16:
+; CHECK: buffer_load_ushort [[VAL:v[0-9]+]]
+; CHECK: buffer_store_short [[VAL]]
+define void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+  %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in
+  %bc = bitcast <4 x i16> %ld to i64
+  %trunc = trunc i64 %bc to i16
+  store i16 %trunc, i16 addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Don't want load width reduced in this case.
+; CHECK-LABEL: {{^}}trunc_i8_bitcast_v2i8:
+; CHECK: buffer_load_ubyte [[VAL:v[0-9]+]]
+; CHECK: buffer_store_byte [[VAL]]
+define void @trunc_i8_bitcast_v2i8(i8 addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
+  %ld = load <2 x i8>, <2 x i8> addrspace(1)* %in
+  %bc = bitcast <2 x i8> %ld to i16
+  %trunc = trunc i16 %bc to i8
+  store i8 %trunc, i8 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}trunc_i32_bitcast_v4i8:
+; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
+; CHECK: buffer_store_byte [[VAL]]
+define void @trunc_i32_bitcast_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
+  %ld = load <4 x i8>, <4 x i8> addrspace(1)* %in
+  %bc = bitcast <4 x i8> %ld to i32
+  %trunc = trunc i32 %bc to i8
+  store i8 %trunc, i8 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}trunc_i24_bitcast_v3i8:
+; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
+; CHECK: buffer_store_byte [[VAL]]
+define void @trunc_i24_bitcast_v3i8(i8 addrspace(1)* %out, <3 x i8> addrspace(1)* %in) {
+  %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in
+  %bc = bitcast <3 x i8> %ld to i24
+  %trunc = trunc i24 %bc to i8
+  store i8 %trunc, i8 addrspace(1)* %out
+  ret void
+}
-- 
2.7.4
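
Illustrative sketch (not part of the commit; node numbers are made up): how
the new fold is expected to unblock load width reduction in the
trunc_i64_bitcast_v2i32 test above, shown in rough SelectionDAG notation.

Before the combine, an i64 value that AMDGPU keeps as v2i32 looks like:

  t1: v2i32,ch = load<(load 8)> ...
  t2: i64 = bitcast t1
  t3: i32 = truncate t2

After the combine:

  t3: i32 = extract_vector_elt t1, 0

The bitcast/truncate form hides that only the low 32 bits of the load are
used; the extract_vector_elt form lets the existing combines narrow the
8-byte vector load to a single 4-byte load, matching the buffer_load_dword
check in that test.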