From b5edd522d195447e3ae16f95c5821762edbf815a Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 19 Jan 2022 16:16:20 -0500 Subject: [PATCH] AMDGPU/GlobalISel: Do not create readfirstlane with non-s32 type We should probably handle any 32-bit type here, but the intrinsic definition and selection pattern currently do not. Avoids a few lit test failures when switched on by default. --- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 12 ++++++ llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td | 2 +- .../AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll | 45 +++++++++++++++++++++- 3 files changed, 57 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 7a626fa..2bdceee 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -70,6 +70,18 @@ struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler { const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); if (TRI->isSGPRReg(MRI, PhysReg)) { + LLT Ty = MRI.getType(ExtReg); + LLT S32 = LLT::scalar(32); + if (Ty != S32) { + // FIXME: We should probably support readfirstlane intrinsics with all + // legal 32-bit types. 
+ assert(Ty.getSizeInBits() == 32); + if (Ty.isPointer()) + ExtReg = MIRBuilder.buildPtrToInt(S32, ExtReg).getReg(0); + else + ExtReg = MIRBuilder.buildBitcast(S32, ExtReg).getReg(0); + } + auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, {MRI.getType(ExtReg)}, false) .addReg(ExtReg); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td index b6c6607..2b70665 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -99,7 +99,7 @@ def CC_SI_SHADER : CallingConv<[ def RetCC_SI_Shader : CallingConv<[ CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>, - CCIfType<[i32, i16] , CCAssignToReg<[ + CCIfType<[i32, i16, v2i16] , CCAssignToReg<[ SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7, SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15, SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll index 2da014d..552e72e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -stop-after=irtranslator -global-isel %s -o - | FileCheck %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -stop-after=irtranslator %s -o - | FileCheck %s ; Check that we correctly skip over disabled inputs define amdgpu_ps void @disabled_input(float inreg %arg0, float %psinput0, float %psinput1) #1 { @@ -109,6 +109,49 @@ define amdgpu_ps { i32, i32 } @sgpr_struct_return_i32_i32(i32 %vgpr0, i32 %vgpr1 ret { i32, i32 } %value } +define amdgpu_ps i8 addrspace(3)* @sgpr_return_p3i8(i8 addrspace(3)* %vgpr) { + ; CHECK-LABEL: name: sgpr_return_p3i8 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: 
$vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p3) + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[PTRTOINT]](s32) + ; CHECK-NEXT: $sgpr0 = COPY [[INT]](s32) + ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 + ret i8 addrspace(3)* %vgpr +} + +define amdgpu_ps i8 addrspace(1)* @sgpr_return_p1i8(i8 addrspace(1)* %vgpr) { + ; CHECK-LABEL: name: sgpr_return_p1i8 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](p1) + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32) + ; CHECK-NEXT: $sgpr0 = COPY [[INT]](s32) + ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32) + ; CHECK-NEXT: $sgpr1 = COPY [[INT1]](s32) + ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ret i8 addrspace(1)* %vgpr +} + +define amdgpu_ps <2 x i16> @sgpr_return_v2i16(<2 x i16> %vgpr) { + ; CHECK-LABEL: name: sgpr_return_v2i16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[BITCAST]](s32) + ; CHECK-NEXT: $sgpr0 = COPY [[INT]](s32) + ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 + ret <2 x i16> %vgpr +} + declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) #0 attributes #0 = { nounwind } -- 2.7.4