From 57b5966dad858f30fd3bbdf42ba560ef9382f0c2 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Mon, 10 Sep 2018 11:49:23 +0000
Subject: [PATCH] DAG: Handle odd vector sizes in calling conv splitting

This already worked if only one register piece was used, but didn't
if a type was split into multiple, unequal-sized pieces.

Fixes not splitting v3i16/v3f16 into two registers for AMDGPU.

This will also allow fixing the ABI for 16-bit vectors in a future
commit so that it's the same for all subtargets.

llvm-svn: 341801
---
 .../CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 29 ++++++++++--------
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp        | 13 ++++----
 llvm/test/CodeGen/AMDGPU/call-argument-types.ll  | 27 +++++++++++++----
 llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll    |  7 ++---
 llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll      |  2 +-
 llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll      |  2 +-
 llvm/test/CodeGen/AMDGPU/function-args.ll        | 13 ++++++--
 llvm/test/CodeGen/AMDGPU/function-returns.ll     |  8 ++---
 llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll           | 35 +++++++++++-----------
 9 files changed, 81 insertions(+), 55 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index e0a3d54..4185514 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -701,33 +701,38 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL,
     TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT, IntermediateVT,
                                NumIntermediates, RegisterVT);
   }
-  unsigned NumElements = ValueVT.getVectorNumElements();
 
   assert(NumRegs == NumParts && "Part count doesn't match vector breakdown!");
   NumParts = NumRegs; // Silence a compiler warning.
   assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!");
 
+  unsigned IntermediateNumElts = IntermediateVT.isVector() ?
+    IntermediateVT.getVectorNumElements() : 1;
+
   // Convert the vector to the appropiate type if necessary.
-  unsigned DestVectorNoElts =
-      NumIntermediates *
-      (IntermediateVT.isVector() ? IntermediateVT.getVectorNumElements() : 1);
+  unsigned DestVectorNoElts = NumIntermediates * IntermediateNumElts;
+
   EVT BuiltVectorTy = EVT::getVectorVT(
       *DAG.getContext(), IntermediateVT.getScalarType(), DestVectorNoElts);
-  if (Val.getValueType() != BuiltVectorTy)
+  MVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+  if (ValueVT != BuiltVectorTy) {
+    if (SDValue Widened = widenVectorToPartType(DAG, Val, DL, BuiltVectorTy))
+      Val = Widened;
+
     Val = DAG.getNode(ISD::BITCAST, DL, BuiltVectorTy, Val);
+  }
 
   // Split the vector into intermediate operands.
   SmallVector<SDValue, 8> Ops(NumIntermediates);
   for (unsigned i = 0; i != NumIntermediates; ++i) {
-    if (IntermediateVT.isVector())
-      Ops[i] =
-          DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, IntermediateVT, Val,
-                      DAG.getConstant(i * (NumElements / NumIntermediates), DL,
-                                      TLI.getVectorIdxTy(DAG.getDataLayout())));
-    else
+    if (IntermediateVT.isVector()) {
+      Ops[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, IntermediateVT, Val,
+                           DAG.getConstant(i * IntermediateNumElts, DL, IdxVT));
+    } else {
       Ops[i] = DAG.getNode(
           ISD::EXTRACT_VECTOR_ELT, DL, IntermediateVT, Val,
-          DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+          DAG.getConstant(i, DL, IdxVT));
+    }
   }
 
   // Split the intermediate operands into legal parts.
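Illustration only, not part of the patch: the breakdown above no longer requires a
power-of-two element count; it rounds the number of intermediate pieces up and steps
the extract index by the intermediate width. A minimal standalone C++ sketch of that
arithmetic (Breakdown and breakdown16BitVector are made-up names, not LLVM APIs):

  #include <cstdio>

  // Sketch of the rounded-up breakdown: an N-element 16-bit vector is split
  // into ceil(N / 2) two-element pieces, and piece i is extracted starting at
  // element i * 2 of the (widened) source, so v3i16/v3f16 becomes two pieces.
  struct Breakdown {
    unsigned NumIntermediates;    // number of v2i16/v2f16 register pieces
    unsigned IntermediateNumElts; // elements per piece
  };

  static Breakdown breakdown16BitVector(unsigned NumElts) {
    Breakdown B;
    B.IntermediateNumElts = 2;              // each piece is a v2i16/v2f16
    B.NumIntermediates = (NumElts + 1) / 2; // round up for odd sizes
    return B;
  }

  int main() {
    for (unsigned NumElts : {2u, 3u, 4u, 5u}) {
      Breakdown B = breakdown16BitVector(NumElts);
      std::printf("%u elements -> %u pieces, starting at elements:", NumElts,
                  B.NumIntermediates);
      for (unsigned i = 0; i != B.NumIntermediates; ++i)
        std::printf(" %u", i * B.IntermediateNumElts);
      std::printf("\n");
    }
    return 0;
  }

For a 3-element vector this prints two pieces starting at elements 0 and 2, matching
the two-register split the AMDGPU changes and tests below now expect.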
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c9f0330..9359d53 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -718,9 +718,7 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
     if (Size == 64)
       return MVT::i32;
 
-    if (Size == 16 &&
-        Subtarget->has16BitInsts() &&
-        isPowerOf2_32(VT.getVectorNumElements()))
+    if (Size == 16 && Subtarget->has16BitInsts())
       return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
   }
 
@@ -741,9 +739,8 @@ unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
     if (Size == 64)
       return 2 * NumElts;
 
-    // FIXME: Fails to break down as we want with v3.
-    if (Size == 16 && Subtarget->has16BitInsts() && isPowerOf2_32(NumElts))
-      return VT.getVectorNumElements() / 2;
+    if (Size == 16 && Subtarget->has16BitInsts())
+      return (VT.getVectorNumElements() + 1) / 2;
   }
   return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
 }
@@ -774,10 +771,10 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
     // FIXME: We should fix the ABI to be the same on targets without 16-bit
     // support, but unless we can properly handle 3-vectors, it will be still be
    // inconsistent.
-    if (Size == 16 && Subtarget->has16BitInsts() && isPowerOf2_32(NumElts)) {
+    if (Size == 16 && Subtarget->has16BitInsts()) {
       RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
       IntermediateVT = RegisterVT;
-      NumIntermediates = NumElts / 2;
+      NumIntermediates = (NumElts + 1) / 2;
       return NumIntermediates;
     }
   }
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 2cedfe6..e98dd84 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -399,18 +399,35 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 {
   ret void
 }
 
-; FIXME: materialize constant directly in VGPR
+; GCN-LABEL: {{^}}test_call_external_void_func_v3f16:
+; GFX9: buffer_load_dwordx2 v[0:1]
+; GFX9-NOT: v0
+; GFX9-NOT: v1
+; GFX9: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 {
+  %val = load <3 x half>, <3 x half> addrspace(1)* undef
+  call void @external_void_func_v3f16(<3 x half> %val)
+  ret void
+}
+
 ; GCN-LABEL: {{^}}test_call_external_void_func_v3i16_imm:
-; GFX9-DAG: s_mov_b32 [[K01:s[0-9]+]], 0x20001
-; GFX9-DAG: s_mov_b32 [[K2:s[0-9]+]], 3
-; GFX9: v_mov_b32_e32 v0, [[K01]]
-; GFX9: v_mov_b32_e32 v1, [[K2]]
+; GFX9: v_mov_b32_e32 v0, 0x20001
+; GFX9: v_mov_b32_e32 v1, 3
 ; GFX9: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 {
   call void @external_void_func_v3i16(<3 x i16> <i16 1, i16 2, i16 3>)
   ret void
 }
 
+; GCN-LABEL: {{^}}test_call_external_void_func_v3f16_imm:
+; GFX9: v_mov_b32_e32 v0, 0x40003c00
+; GFX9: v_mov_b32_e32 v1, 0x4400
+; GFX9: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 {
+  call void @external_void_func_v3f16(<3 x half> <half 1.0, half 2.0, half 4.0>)
+  ret void
+}
+
 ; GCN-LABEL: {{^}}test_call_external_void_func_v4i16:
 ; GFX9: buffer_load_dwordx2 v[0:1]
 ; GFX9-NOT: v0
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index 636028b..2945d7d 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -504,16 +504,15 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(<2 x half> a
 ; FIXME: Extra 4th component handled
 ; GCN-LABEL: {{^}}v_test_canonicalize_var_v3f16:
 ; GFX9: s_waitcnt
-; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
 ; GFX9-NEXT: s_setpc_b64
 
-; VI-DAG: v_max_f16_sdwa [[CANON_ELT3:v[0-9]+]], v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-DAG: v_max_f16_e32 [[CANON_ELT2:v[0-9]+]], v1, v1
 ; VI-DAG: v_max_f16_sdwa [[CANON_ELT1:v[0-9]+]], v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-DAG: v_max_f16_e32 [[CANON_ELT0:v[0-9]+]], v0, v0
+; VI-DAG: v_max_f16_e32 v1, v1, v1
 ; VI-DAG: v_or_b32_e32 v0, [[CANON_ELT0]], [[CANON_ELT1]]
-; VI-DAG: v_or_b32_e32 v1, [[CANON_ELT2]], [[CANON_ELT3]]
+
 ; VI: s_setpc_b64
 define <3 x half> @v_test_canonicalize_var_v3f16(<3 x half> %val) #1 {
   %canonicalized = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> %val)
diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
index 23d0971..e06d93f 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
@@ -153,8 +153,8 @@ define <3 x half> @test_fmax_legacy_ugt_v3f16(<3 x half> %a, <3 x half> %b) #0 {
 ; GFX9-NNAN-LABEL: test_fmax_legacy_ugt_v3f16:
 ; GFX9-NNAN: ; %bb.0:
 ; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NNAN-NEXT: v_pk_max_f16 v1, v1, v3
 ; GFX9-NNAN-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX9-NNAN-NEXT: v_pk_max_f16 v1, v1, v3
 ; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: test_fmax_legacy_ugt_v3f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
index 22773ac..2005730 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
@@ -154,8 +154,8 @@ define <3 x half> @test_fmin_legacy_ule_v3f16(<3 x half> %a, <3 x half> %b) #0 {
 ; GFX9-NNAN-LABEL: test_fmin_legacy_ule_v3f16:
 ; GFX9-NNAN: ; %bb.0:
 ; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NNAN-NEXT: v_pk_min_f16 v1, v1, v3
 ; GFX9-NNAN-NEXT: v_pk_min_f16 v0, v0, v2
+; GFX9-NNAN-NEXT: v_pk_min_f16 v1, v1, v3
 ; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: test_fmin_legacy_ule_v3f16:
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 71541b2..e4a18bf 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -314,8 +314,17 @@ define void @void_func_v4i16(<4 x i16> %arg0) #0 {
 }
 
 ; GCN-LABEL: {{^}}void_func_v5i16:
-; GCN-DAG: buffer_store_short v4, off,
-; GCN-DAG: buffer_store_dwordx2 v[1:2], off
+; CI: v_lshlrev_b32
+; CI: v_and_b32
+; CI: v_lshlrev_b32
+; CI: v_or_b32
+; CI: v_or_b32
+; CI-DAG: buffer_store_short v
+; CI-DAG: buffer_store_dwordx2 v
+
+; GFX89-DAG: buffer_store_short v2, off,
+; GFX89-DAG: buffer_store_dwordx2 v[0:1], off
+
 define void @void_func_v5i16(<5 x i16> %arg0) #0 {
   store <5 x i16> %arg0, <5 x i16> addrspace(1)* undef
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index 8d30b24..f0927de 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -317,13 +317,13 @@ define <4 x half> @v4f16_func_void() #0 {
   ret <4 x half> %val
 }
 
+; FIXME: Mixing buffer and global
 ; FIXME: Should not scalarize
 ; GCN-LABEL: {{^}}v5i16_func_void:
 ; GFX9: buffer_load_dwordx2 v[0:1]
-; GFX9: buffer_load_ushort v4
-; GFX9: v_lshrrev_b32_e32 v5, 16, v0
-; GFX9: v_lshrrev_b32_e32 v3, 16, v1
-; GCN: s_setpc_b64
+; GFX9-NEXT: global_load_short_d16 v2
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
 define <5 x i16> @v5i16_func_void() #0 {
   %ptr = load volatile <5 x i16> addrspace(1)*, <5 x i16> addrspace(1)* addrspace(4)* undef
   %val = load <5 x i16>, <5 x i16> addrspace(1)* %ptr
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index ed7b67f..03e8721 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -94,12 +94,10 @@ define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half
 
 ; GCN-LABEL: {{^}}v_mad_mix_v3f32:
 ; GCN: s_waitcnt
-; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
-; GFX9-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
-; GFX9-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; GFX9-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; GFX9-NEXT: v_mov_b32_e32 v0, v6
-; GFX9-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX9-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
 ; GFX9-NEXT: s_setpc_b64
 define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
   %src0.ext = fpext <3 x half> %src0 to <3 x float>
@@ -149,11 +147,11 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s
 ; GCN-LABEL: {{^}}v_mad_mix_v3f32_clamp_postcvt:
 ; GCN: s_waitcnt
 ; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GFX9-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
 ; GFX9-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX9-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX9-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GFX9-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; GFX9-NEXT: v_mov_b32_e32 v0, v6
-; GFX9-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-NEXT: v_mov_b32_e32 v1, v2
 ; GFX9-NEXT: s_setpc_b64
 define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
   %src0.ext = fpext <3 x half> %src0 to <3 x float>
@@ -246,15 +244,16 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr
 
 ; FIXME: Handling undef 4th component
 ; GCN-LABEL: {{^}}v_mad_mix_v3f32_clamp_precvt:
-; GFX9: v_mad_mix_f32 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX9: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GFX9: v_mad_mix_f32 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; GFX9: v_mad_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-
-; GFX9: v_cvt_f16_f32
-; GFX9: v_cvt_f16_f32
-; GFX9: v_cvt_f16_f32
-; GFX9: v_cvt_f16_f32
+; GCN: s_waitcnt
+; GFX9-NEXT: v_mad_mix_f32 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX9-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v6
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-NEXT: s_setpc_b64
 define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
   %src0.ext = fpext <3 x half> %src0 to <3 x float>
   %src1.ext = fpext <3 x half> %src1 to <3 x float>
-- 
2.7.4
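Illustration only, not part of the patch: the *_imm tests above pin down how an odd-sized
16-bit vector now sits in registers: elements 0 and 1 are packed into the first 32-bit
VGPR, and element 2 occupies the low half of the second. A standalone C++ sketch (not
LLVM code) that reproduces the checked bit patterns:

  #include <cstdint>
  #include <cstdio>

  int main() {
    // IEEE half bit patterns for 1.0, 2.0 and 4.0, as in
    // test_call_external_void_func_v3f16_imm.
    const uint16_t Elt[3] = {0x3c00, 0x4000, 0x4400};

    // Elements 0 and 1 share the first register; element 2 goes in the low
    // half of the second register, whose high half is undefined padding.
    uint32_t V0 = uint32_t(Elt[0]) | (uint32_t(Elt[1]) << 16);
    uint32_t V1 = uint32_t(Elt[2]);

    std::printf("v0 = 0x%08x\n", V0); // 0x40003c00
    std::printf("v1 = 0x%08x\n", V1); // 0x00004400
    return 0;
  }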