From 3ad5216ed88e303cb5d37864bb83b0eec81144af Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 26 Feb 2021 13:40:03 +0000 Subject: [PATCH] [AMDGPU] Better codegen for i64 bitreverse Differential Revision: https://reviews.llvm.org/D97547 --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 9 +- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 3 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 1 + llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 10 +- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 3 +- llvm/lib/Target/AMDGPU/SOPInstructions.td | 4 +- .../AMDGPU/GlobalISel/inst-select-bitreverse.mir | 65 +++ .../AMDGPU/GlobalISel/legalize-bitreverse.mir | 19 +- .../AMDGPU/GlobalISel/regbankselect-bitreverse.mir | 31 + llvm/test/CodeGen/AMDGPU/bitreverse.ll | 634 +++++++-------------- 10 files changed, 345 insertions(+), 434 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 931ce99..2925f70 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -935,10 +935,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .widenScalarToNextPow2(0, 32) .widenScalarToNextPow2(1, 32); + // S64 is only legal on SALU, and needs to be broken into 32-bit elements in + // RegBankSelect. getActionDefinitionsBuilder(G_BITREVERSE) - .legalFor({S32}) - .clampScalar(0, S32, S32) - .scalarize(0); + .legalFor({S32, S64}) + .clampScalar(0, S32, S64) + .scalarize(0) + .widenScalarToNextPow2(0); if (ST.has16BitInsts()) { getActionDefinitionsBuilder(G_BSWAP) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 59279b6..ae564d8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -2393,6 +2393,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( return; } case AMDGPU::G_CTPOP: + case AMDGPU::G_BITREVERSE: case AMDGPU::G_CTLZ_ZERO_UNDEF: case AMDGPU::G_CTTZ_ZERO_UNDEF: { const RegisterBank *DstBank = @@ -3607,10 +3608,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize); break; } + case AMDGPU::G_BITREVERSE: case AMDGPU::G_BITCAST: case AMDGPU::G_INTTOPTR: case AMDGPU::G_PTRTOINT: - case AMDGPU::G_BITREVERSE: case AMDGPU::G_FABS: case AMDGPU::G_FNEG: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 6353659..0eaec89 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -388,6 +388,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, } setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); // FIXME: This should be narrowed to i32, but that only happens if i64 is // illegal. 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 46e213b..b8abd6d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -5742,6 +5742,11 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst, Inst.eraseFromParent(); continue; + case AMDGPU::S_BREV_B64: + splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true); + Inst.eraseFromParent(); + continue; + case AMDGPU::S_NOT_B64: splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); Inst.eraseFromParent(); @@ -6292,7 +6297,7 @@ void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist, void SIInstrInfo::splitScalar64BitUnaryOp( SetVectorType &Worklist, MachineInstr &Inst, - unsigned Opcode) const { + unsigned Opcode, bool Swap) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -6325,6 +6330,9 @@ void SIInstrInfo::splitScalar64BitUnaryOp( Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC); MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1); + if (Swap) + std::swap(DestSub0, DestSub1); + Register FullDestReg = MRI.createVirtualRegister(NewDestRC); BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) .addReg(DestSub0) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 49cfae5..b5a597c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -96,7 +96,8 @@ private: unsigned Opcode) const; void splitScalar64BitUnaryOp(SetVectorType &Worklist, - MachineInstr &Inst, unsigned Opcode) const; + MachineInstr &Inst, unsigned Opcode, + bool Swap = false) const; void splitScalar64BitAddSub(SetVectorType &Worklist, MachineInstr &Inst, MachineDominatorTree *MDT = nullptr) const; diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 7426af9..50725de 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -195,7 +195,9 @@ def : GCNPat < def S_BREV_B32 : SOP1_32 <"s_brev_b32", [(set i32:$sdst, (bitreverse i32:$src0))] >; -def S_BREV_B64 : SOP1_64 <"s_brev_b64">; +def S_BREV_B64 : SOP1_64 <"s_brev_b64", + [(set i64:$sdst, (bitreverse i64:$src0))] +>; let Defs = [SCC] in { def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bitreverse.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bitreverse.mir index a99e602..eaa9a37 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bitreverse.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bitreverse.mir @@ -51,3 +51,68 @@ body: | %1:vgpr(s32) = G_BITREVERSE %0 S_ENDPGM 0, implicit %1 ... + +--- +name: bitreverse_i64_ss +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; CHECK-LABEL: name: bitreverse_i64_ss + ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; CHECK: [[S_BREV_B64_:%[0-9]+]]:sreg_64 = S_BREV_B64 [[COPY]] + ; CHECK: S_ENDPGM 0, implicit [[S_BREV_B64_]] + %0:sgpr(s64) = COPY $sgpr0_sgpr1 + %1:sgpr(s64) = G_BITREVERSE %0 + S_ENDPGM 0, implicit %1 +... 
+ +--- +name: bitreverse_i64_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: bitreverse_i64_vv + ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; CHECK: [[V_BFREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_BFREV_B32_e64 [[COPY2]], implicit $exec + ; CHECK: [[V_BFREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_BFREV_B32_e64 [[COPY1]], implicit $exec + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_BFREV_B32_e64_]], %subreg.sub0, [[V_BFREV_B32_e64_1]], %subreg.sub1 + ; CHECK: S_ENDPGM 0, implicit [[REG_SEQUENCE]] + %0:vgpr(s64) = COPY $vgpr0_vgpr1 + %2:vgpr(s32), %3:vgpr(s32) = G_UNMERGE_VALUES %0(s64) + %4:vgpr(s32) = G_BITREVERSE %3 + %5:vgpr(s32) = G_BITREVERSE %2 + %1:vgpr(s64) = G_MERGE_VALUES %4(s32), %5(s32) + S_ENDPGM 0, implicit %1 +... + +--- +name: bitreverse_i64_vs +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; CHECK-LABEL: name: bitreverse_i64_vs + ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 + ; CHECK: [[V_BFREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_BFREV_B32_e64 [[COPY2]], implicit $exec + ; CHECK: [[V_BFREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_BFREV_B32_e64 [[COPY1]], implicit $exec + ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_BFREV_B32_e64_]], %subreg.sub0, [[V_BFREV_B32_e64_1]], %subreg.sub1 + ; CHECK: S_ENDPGM 0, implicit [[REG_SEQUENCE]] + %0:sgpr(s64) = COPY $sgpr0_sgpr1 + %2:sgpr(s32), %3:sgpr(s32) = G_UNMERGE_VALUES %0(s64) + %4:vgpr(s32) = G_BITREVERSE %3 + %5:vgpr(s32) = G_BITREVERSE %2 + %1:vgpr(s64) = G_MERGE_VALUES %4(s32), %5(s32) + S_ENDPGM 0, implicit %1 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-bitreverse.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-bitreverse.mir index 87b468c..c365bdc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-bitreverse.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-bitreverse.mir @@ -136,11 +136,8 @@ body: | liveins: $vgpr0_vgpr1 ; CHECK-LABEL: name: bitreverse_s64 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) - ; CHECK: [[BITREVERSE:%[0-9]+]]:_(s32) = G_BITREVERSE [[UV1]] - ; CHECK: [[BITREVERSE1:%[0-9]+]]:_(s32) = G_BITREVERSE [[UV]] - ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[BITREVERSE]](s32), [[BITREVERSE1]](s32) - ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64) + ; CHECK: [[BITREVERSE:%[0-9]+]]:_(s64) = G_BITREVERSE [[COPY]] + ; CHECK: $vgpr0_vgpr1 = COPY [[BITREVERSE]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = G_BITREVERSE %0 $vgpr0_vgpr1 = COPY %1 @@ -155,15 +152,9 @@ body: | ; CHECK-LABEL: name: bitreverse_v2s64 ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) - ; CHECK: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64) - ; CHECK: [[BITREVERSE:%[0-9]+]]:_(s32) = G_BITREVERSE [[UV3]] - ; CHECK: [[BITREVERSE1:%[0-9]+]]:_(s32) = G_BITREVERSE [[UV2]] - ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[BITREVERSE]](s32), [[BITREVERSE1]](s32) - ; CHECK: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) - ; CHECK: [[BITREVERSE2:%[0-9]+]]:_(s32) = G_BITREVERSE [[UV5]] - ; CHECK: [[BITREVERSE3:%[0-9]+]]:_(s32) = G_BITREVERSE [[UV4]] - ; CHECK: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[BITREVERSE2]](s32), [[BITREVERSE3]](s32) - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) + ; CHECK: [[BITREVERSE:%[0-9]+]]:_(s64) = G_BITREVERSE [[UV]] + ; CHECK: [[BITREVERSE1:%[0-9]+]]:_(s64) = G_BITREVERSE [[UV1]] + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[BITREVERSE]](s64), [[BITREVERSE1]](s64) ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(<2 x s64>) = G_BITREVERSE %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-bitreverse.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-bitreverse.mir index 15f4711..f5cb09e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-bitreverse.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-bitreverse.mir @@ -29,3 +29,34 @@ body: | %0:_(s32) = COPY $vgpr0 %1:_(s32) = G_BITREVERSE %0 ... + +--- +name: bitreverse_i64_s +legalized: true + +body: | + bb.0: + liveins: $sgpr0 + ; CHECK-LABEL: name: bitreverse_i64_s + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 + ; CHECK: [[BITREVERSE:%[0-9]+]]:sgpr(s64) = G_BITREVERSE [[COPY]] + %0:_(s64) = COPY $sgpr0_sgpr1 + %1:_(s64) = G_BITREVERSE %0 +... 
+ +--- +name: bitreverse_i64_v +legalized: true + +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: bitreverse_i64_v + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK: [[BITREVERSE:%[0-9]+]]:vgpr(s32) = G_BITREVERSE [[UV1]] + ; CHECK: [[BITREVERSE1:%[0-9]+]]:vgpr(s32) = G_BITREVERSE [[UV]] + ; CHECK: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[BITREVERSE]](s32), [[BITREVERSE1]](s32) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = G_BITREVERSE %0 +... diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll index d6dc59a..99ccc1d 100644 --- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=FLAT ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=FLAT +; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -global-isel -verify-machineinstrs | FileCheck %s --check-prefix=GISEL declare i32 @llvm.amdgcn.workitem.id.x() #1 @@ -41,6 +42,20 @@ define amdgpu_kernel void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) ; FLAT-NEXT: v_mov_b32_e32 v0, s0 ; FLAT-NEXT: buffer_store_short v0, off, s[4:7], 0 ; FLAT-NEXT: s_endpgm +; +; GISEL-LABEL: s_brev_i16: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dword s0, s[0:1], 0x2c +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-NEXT: s_and_b32 s0, s0, 0xffff +; GISEL-NEXT: s_brev_b32 s0, s0 +; GISEL-NEXT: s_lshr_b32 s0, s0, 16 +; GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: flat_store_short v[0:1], v2 +; GISEL-NEXT: s_endpgm %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1 store i16 %brev, i16 addrspace(1)* %out ret void @@ -78,6 +93,22 @@ define amdgpu_kernel void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrsp ; FLAT-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; FLAT-NEXT: buffer_store_short v0, off, s[4:7], 0 ; FLAT-NEXT: s_endpgm +; +; GISEL-LABEL: v_brev_i16: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: flat_load_ushort v0, v[0:1] +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_bfrev_b32_e32 v0, v0 +; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: flat_store_short v[0:1], v2 +; GISEL-NEXT: s_endpgm %val = load i16, i16 addrspace(1)* %valptr %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1 store i16 %brev, i16 addrspace(1)* %out @@ -108,6 +139,18 @@ define amdgpu_kernel void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) ; FLAT-NEXT: v_mov_b32_e32 v0, s0 ; FLAT-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; FLAT-NEXT: s_endpgm +; +; GISEL-LABEL: s_brev_i32: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dword s0, s[0:1], 0x2c +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-NEXT: s_brev_b32 s0, s0 +; GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: 
flat_store_dword v[0:1], v2 +; GISEL-NEXT: s_endpgm %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1 store i32 %brev, i32 addrspace(1)* %out ret void @@ -147,6 +190,24 @@ define amdgpu_kernel void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrsp ; FLAT-NEXT: v_bfrev_b32_e32 v0, v0 ; FLAT-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; FLAT-NEXT: s_endpgm +; +; GISEL-LABEL: v_brev_i32: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: flat_load_dword v0, v[0:1] +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_bfrev_b32_e32 v2, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: flat_store_dword v[0:1], v2 +; GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid %val = load i32, i32 addrspace(1)* %gep @@ -183,6 +244,20 @@ define amdgpu_kernel void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 ; FLAT-NEXT: v_mov_b32_e32 v1, s1 ; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; FLAT-NEXT: s_endpgm +; +; GISEL-LABEL: s_brev_v2i32: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-NEXT: s_brev_b32 s0, s0 +; GISEL-NEXT: s_brev_b32 s1, s1 +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: v_mov_b32_e32 v3, s3 +; GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GISEL-NEXT: s_endpgm %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1 store <2 x i32> %brev, <2 x i32> addrspace(1)* %out ret void @@ -224,6 +299,25 @@ define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 ; FLAT-NEXT: v_bfrev_b32_e32 v0, v0 ; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; FLAT-NEXT: s_endpgm +; +; GISEL-LABEL: v_brev_v2i32: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-NEXT: v_mov_b32_e32 v3, s3 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_bfrev_b32_e32 v0, v0 +; GISEL-NEXT: v_bfrev_b32_e32 v1, v1 +; GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid %val = load <2 x i32>, <2 x i32> addrspace(1)* %gep @@ -235,93 +329,42 @@ define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 define amdgpu_kernel void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 { ; SI-LABEL: s_brev_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s4, 0xff00ff -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; 
SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_alignbit_b32 v0, s2, s2, 8 -; SI-NEXT: v_alignbit_b32 v1, s2, s2, 24 -; SI-NEXT: v_alignbit_b32 v2, s3, s3, 8 -; SI-NEXT: v_alignbit_b32 v3, s3, s3, 24 -; SI-NEXT: v_bfi_b32 v4, s4, v1, v0 -; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f -; SI-NEXT: v_bfi_b32 v2, s4, v3, v2 -; SI-NEXT: v_and_b32_e32 v1, s2, v4 -; SI-NEXT: v_and_b32_e32 v0, s2, v2 -; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f0 -; SI-NEXT: v_and_b32_e32 v3, s2, v4 -; SI-NEXT: v_and_b32_e32 v2, s2, v2 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4 -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 4 -; SI-NEXT: s_mov_b32 s2, 0x33333333 -; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_or_b32_e32 v3, v3, v1 -; SI-NEXT: v_and_b32_e32 v1, s2, v3 -; SI-NEXT: v_and_b32_e32 v0, s2, v2 -; SI-NEXT: s_mov_b32 s2, 0xcccccccc -; SI-NEXT: v_and_b32_e32 v3, s2, v3 -; SI-NEXT: v_and_b32_e32 v2, s2, v2 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 2 -; SI-NEXT: s_mov_b32 s2, 0x55555555 -; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_or_b32_e32 v3, v3, v1 -; SI-NEXT: v_and_b32_e32 v1, s2, v3 -; SI-NEXT: v_and_b32_e32 v0, s2, v2 -; SI-NEXT: s_mov_b32 s2, 0xaaaaaaaa -; SI-NEXT: v_and_b32_e32 v3, s2, v3 -; SI-NEXT: v_and_b32_e32 v2, s2, v2 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 1 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_brev_b64 s[0:1], s[0:1] +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: s_brev_i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; FLAT-NEXT: v_mov_b32_e32 v0, 0x10203 -; FLAT-NEXT: s_mov_b32 s4, 0xf0f0f0f -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; FLAT-NEXT: s_mov_b32 s7, 0xf000 +; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_perm_b32 v2, 0, s2, v0 -; FLAT-NEXT: v_perm_b32 v4, 0, s3, v0 -; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f0 -; FLAT-NEXT: v_and_b32_e32 v1, s4, v2 -; FLAT-NEXT: v_and_b32_e32 v0, s4, v4 -; FLAT-NEXT: v_and_b32_e32 v3, s2, v2 -; FLAT-NEXT: v_and_b32_e32 v2, s2, v4 -; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1] -; FLAT-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3] -; FLAT-NEXT: s_mov_b32 s2, 0x33333333 -; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 -; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 -; FLAT-NEXT: v_and_b32_e32 v1, s2, v3 -; FLAT-NEXT: v_and_b32_e32 v0, s2, v2 -; FLAT-NEXT: s_mov_b32 s2, 0xcccccccc -; FLAT-NEXT: v_and_b32_e32 v3, s2, v3 -; FLAT-NEXT: v_and_b32_e32 v2, s2, v2 -; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; FLAT-NEXT: v_lshrrev_b64 v[2:3], 2, v[2:3] -; FLAT-NEXT: s_mov_b32 s2, 0x55555555 -; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 -; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 -; FLAT-NEXT: v_and_b32_e32 v1, s2, v3 -; FLAT-NEXT: v_and_b32_e32 v0, s2, v2 -; FLAT-NEXT: s_mov_b32 s2, 0xaaaaaaaa -; FLAT-NEXT: v_and_b32_e32 v3, s2, v3 -; FLAT-NEXT: v_and_b32_e32 v2, s2, v2 -; FLAT-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; FLAT-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; FLAT-NEXT: s_mov_b32 s3, 0xf000 -; FLAT-NEXT: s_mov_b32 s2, -1 -; FLAT-NEXT: v_or_b32_e32 v0, v2, v0 -; FLAT-NEXT: v_or_b32_e32 v1, v3, v1 -; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; FLAT-NEXT: s_brev_b64 s[0:1], s[0:1] 
+; FLAT-NEXT: v_mov_b32_e32 v0, s0 +; FLAT-NEXT: v_mov_b32_e32 v1, s1 +; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; FLAT-NEXT: s_endpgm +; +; GISEL-LABEL: s_brev_i64: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-NEXT: s_brev_b64 s[0:1], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: v_mov_b32_e32 v3, s3 +; GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GISEL-NEXT: s_endpgm %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1 store i64 %brev, i64 addrspace(1)* %out ret void @@ -339,46 +382,11 @@ define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrsp ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s0, 0xff00ff -; SI-NEXT: s_mov_b32 s1, 0xf0f0f0f -; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f0 -; SI-NEXT: s_mov_b32 s3, 0x33333333 -; SI-NEXT: s_mov_b32 s6, 0xcccccccc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v0, v0, 8 -; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 -; SI-NEXT: v_alignbit_b32 v3, v1, v1, 8 -; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 -; SI-NEXT: v_bfi_b32 v2, s0, v0, v2 -; SI-NEXT: v_bfi_b32 v4, s0, v1, v3 -; SI-NEXT: v_and_b32_e32 v1, s1, v2 -; SI-NEXT: v_and_b32_e32 v0, s1, v4 -; SI-NEXT: v_and_b32_e32 v3, s2, v2 -; SI-NEXT: v_and_b32_e32 v2, s2, v4 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4 -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 4 -; SI-NEXT: s_mov_b32 s0, 0x55555555 -; SI-NEXT: v_or_b32_e32 v3, v3, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, s3, v3 -; SI-NEXT: v_and_b32_e32 v0, s3, v2 -; SI-NEXT: v_and_b32_e32 v3, s6, v3 -; SI-NEXT: v_and_b32_e32 v2, s6, v2 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 2 -; SI-NEXT: s_mov_b32 s1, 0xaaaaaaaa -; SI-NEXT: v_or_b32_e32 v3, v3, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, s0, v3 -; SI-NEXT: v_and_b32_e32 v0, s0, v2 -; SI-NEXT: v_and_b32_e32 v3, s1, v3 -; SI-NEXT: v_and_b32_e32 v2, s1, v2 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 1 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_bfrev_b32_e32 v2, v0 +; SI-NEXT: v_bfrev_b32_e32 v1, v1 +; SI-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: v_brev_i64: @@ -386,49 +394,37 @@ define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrsp ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f0 -; FLAT-NEXT: s_mov_b32 s3, 0x33333333 -; FLAT-NEXT: s_mov_b32 s6, 0xcccccccc +; FLAT-NEXT: s_mov_b32 s7, 0xf000 +; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s1 ; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; FLAT-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; FLAT-NEXT: s_mov_b32 s0, 0x10203 -; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f -; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_waitcnt vmcnt(0) -; FLAT-NEXT: v_perm_b32 v2, 0, v0, s0 -; FLAT-NEXT: v_perm_b32 v4, 0, v1, s0 -; FLAT-NEXT: v_and_b32_e32 v1, s1, v2 -; FLAT-NEXT: 
v_and_b32_e32 v0, s1, v4 -; FLAT-NEXT: v_and_b32_e32 v3, s2, v2 -; FLAT-NEXT: v_and_b32_e32 v2, s2, v4 -; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1] -; FLAT-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3] -; FLAT-NEXT: s_mov_b32 s0, 0x55555555 -; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 -; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 -; FLAT-NEXT: v_and_b32_e32 v1, s3, v3 -; FLAT-NEXT: v_and_b32_e32 v0, s3, v2 -; FLAT-NEXT: v_and_b32_e32 v3, s6, v3 -; FLAT-NEXT: v_and_b32_e32 v2, s6, v2 -; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; FLAT-NEXT: v_lshrrev_b64 v[2:3], 2, v[2:3] -; FLAT-NEXT: s_mov_b32 s1, 0xaaaaaaaa -; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 -; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 -; FLAT-NEXT: v_and_b32_e32 v1, s0, v3 -; FLAT-NEXT: v_and_b32_e32 v0, s0, v2 -; FLAT-NEXT: v_and_b32_e32 v3, s1, v3 -; FLAT-NEXT: v_and_b32_e32 v2, s1, v2 -; FLAT-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; FLAT-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; FLAT-NEXT: s_mov_b32 s6, -1 -; FLAT-NEXT: v_or_b32_e32 v1, v3, v1 -; FLAT-NEXT: v_or_b32_e32 v0, v2, v0 -; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; FLAT-NEXT: v_bfrev_b32_e32 v2, v0 +; FLAT-NEXT: v_bfrev_b32_e32 v1, v1 +; FLAT-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0 ; FLAT-NEXT: s_endpgm +; +; GISEL-LABEL: v_brev_i64: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v4, s3 +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v3, s2 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_bfrev_b32_e32 v1, v1 +; GISEL-NEXT: v_bfrev_b32_e32 v2, v0 +; GISEL-NEXT: flat_store_dwordx2 v[3:4], v[1:2] +; GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i64, i64 addrspace(1)* %valptr, i32 %tid %val = load i64, i64 addrspace(1)* %gep @@ -442,76 +438,15 @@ define amdgpu_kernel void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s8, 0xff00ff -; SI-NEXT: s_mov_b32 s9, 0x33333333 -; SI-NEXT: s_mov_b32 s10, 0xcccccccc -; SI-NEXT: s_mov_b32 s11, 0x55555555 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_alignbit_b32 v0, s2, s2, 8 -; SI-NEXT: v_alignbit_b32 v1, s2, s2, 24 -; SI-NEXT: v_bfi_b32 v3, s8, v1, v0 -; SI-NEXT: v_alignbit_b32 v2, s3, s3, 8 -; SI-NEXT: v_alignbit_b32 v0, s3, s3, 24 -; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f -; SI-NEXT: v_bfi_b32 v2, s8, v0, v2 -; SI-NEXT: s_mov_b32 s3, 0xf0f0f0f0 -; SI-NEXT: v_and_b32_e32 v0, s2, v2 -; SI-NEXT: v_and_b32_e32 v1, s2, v3 -; SI-NEXT: v_and_b32_e32 v2, s3, v2 -; SI-NEXT: v_and_b32_e32 v3, s3, v3 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4 -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 4 -; SI-NEXT: v_alignbit_b32 v4, s0, s0, 8 -; SI-NEXT: v_alignbit_b32 v5, s0, s0, 24 -; SI-NEXT: v_bfi_b32 v7, s8, v5, v4 -; SI-NEXT: v_alignbit_b32 v4, s1, s1, 8 -; SI-NEXT: v_alignbit_b32 v5, s1, s1, 24 -; SI-NEXT: v_bfi_b32 v6, s8, v5, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_or_b32_e32 v3, v3, v1 -; SI-NEXT: v_and_b32_e32 v0, s9, v2 -; SI-NEXT: v_and_b32_e32 v1, s9, v3 -; SI-NEXT: v_and_b32_e32 v4, s2, v6 -; SI-NEXT: v_and_b32_e32 v5, s2, v7 -; SI-NEXT: v_and_b32_e32 v2, s10, v2 -; SI-NEXT: v_and_b32_e32 v3, s10, 
v3 -; SI-NEXT: v_and_b32_e32 v6, s3, v6 -; SI-NEXT: v_and_b32_e32 v7, s3, v7 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 2 -; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 4 -; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 4 -; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_or_b32_e32 v3, v3, v1 -; SI-NEXT: v_or_b32_e32 v6, v6, v4 -; SI-NEXT: v_or_b32_e32 v7, v7, v5 -; SI-NEXT: s_mov_b32 s12, 0xaaaaaaaa -; SI-NEXT: v_and_b32_e32 v0, s11, v2 -; SI-NEXT: v_and_b32_e32 v1, s11, v3 -; SI-NEXT: v_and_b32_e32 v4, s9, v6 -; SI-NEXT: v_and_b32_e32 v5, s9, v7 -; SI-NEXT: v_and_b32_e32 v2, s12, v2 -; SI-NEXT: v_and_b32_e32 v3, s12, v3 -; SI-NEXT: v_and_b32_e32 v6, s10, v6 -; SI-NEXT: v_and_b32_e32 v7, s10, v7 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 1 -; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 2 -; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 2 -; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_or_b32_e32 v0, v6, v4 -; SI-NEXT: v_or_b32_e32 v7, v7, v5 -; SI-NEXT: v_and_b32_e32 v5, s11, v7 -; SI-NEXT: v_and_b32_e32 v4, s11, v0 -; SI-NEXT: v_and_b32_e32 v6, s12, v0 -; SI-NEXT: v_and_b32_e32 v7, s12, v7 -; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 1 -; SI-NEXT: v_or_b32_e32 v3, v3, v1 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_or_b32_e32 v0, v6, v4 -; SI-NEXT: v_or_b32_e32 v1, v7, v5 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_brev_b64 s[2:3], s[2:3] +; SI-NEXT: s_brev_b64 s[0:1], s[0:1] +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -519,70 +454,33 @@ define amdgpu_kernel void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 ; FLAT: ; %bb.0: ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 -; FLAT-NEXT: v_mov_b32_e32 v4, 0x10203 -; FLAT-NEXT: s_mov_b32 s8, 0xf0f0f0f -; FLAT-NEXT: s_mov_b32 s9, 0xcccccccc -; FLAT-NEXT: s_mov_b32 s10, 0x55555555 -; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_perm_b32 v3, 0, s2, v4 -; FLAT-NEXT: v_perm_b32 v2, 0, s3, v4 -; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f0 -; FLAT-NEXT: v_and_b32_e32 v0, s8, v2 -; FLAT-NEXT: v_and_b32_e32 v1, s8, v3 -; FLAT-NEXT: v_and_b32_e32 v2, s2, v2 -; FLAT-NEXT: v_and_b32_e32 v3, s2, v3 -; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1] -; FLAT-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3] -; FLAT-NEXT: v_perm_b32 v7, 0, s0, v4 -; FLAT-NEXT: v_perm_b32 v6, 0, s1, v4 -; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 -; FLAT-NEXT: s_mov_b32 s3, 0x33333333 -; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 -; FLAT-NEXT: v_and_b32_e32 v0, s3, v2 -; FLAT-NEXT: v_and_b32_e32 v1, s3, v3 -; FLAT-NEXT: v_and_b32_e32 v4, s8, v6 -; FLAT-NEXT: v_and_b32_e32 v5, s8, v7 -; FLAT-NEXT: v_and_b32_e32 v2, s9, v2 -; FLAT-NEXT: v_and_b32_e32 v3, s9, v3 -; FLAT-NEXT: v_and_b32_e32 v6, s2, v6 -; FLAT-NEXT: v_and_b32_e32 v7, s2, v7 -; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; FLAT-NEXT: v_lshrrev_b64 v[2:3], 2, v[2:3] -; FLAT-NEXT: v_lshlrev_b64 v[4:5], 4, v[4:5] -; FLAT-NEXT: v_lshrrev_b64 v[6:7], 4, v[6:7] -; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 -; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 -; FLAT-NEXT: v_or_b32_e32 v6, v6, v4 -; FLAT-NEXT: v_or_b32_e32 v7, v7, v5 -; FLAT-NEXT: s_mov_b32 s11, 0xaaaaaaaa -; FLAT-NEXT: v_and_b32_e32 v0, s10, v2 -; FLAT-NEXT: v_and_b32_e32 v1, s10, v3 -; FLAT-NEXT: v_and_b32_e32 v4, s3, v6 -; FLAT-NEXT: v_and_b32_e32 v5, s3, v7 -; FLAT-NEXT: 
v_and_b32_e32 v2, s11, v2 -; FLAT-NEXT: v_and_b32_e32 v3, s11, v3 -; FLAT-NEXT: v_and_b32_e32 v6, s9, v6 -; FLAT-NEXT: v_and_b32_e32 v7, s9, v7 -; FLAT-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; FLAT-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; FLAT-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] -; FLAT-NEXT: v_lshrrev_b64 v[6:7], 2, v[6:7] -; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 -; FLAT-NEXT: v_or_b32_e32 v0, v6, v4 -; FLAT-NEXT: v_or_b32_e32 v7, v7, v5 -; FLAT-NEXT: v_and_b32_e32 v5, s10, v7 -; FLAT-NEXT: v_and_b32_e32 v4, s10, v0 -; FLAT-NEXT: v_and_b32_e32 v6, s11, v0 -; FLAT-NEXT: v_and_b32_e32 v7, s11, v7 -; FLAT-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] -; FLAT-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] -; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 -; FLAT-NEXT: v_or_b32_e32 v0, v6, v4 -; FLAT-NEXT: v_or_b32_e32 v1, v7, v5 +; FLAT-NEXT: s_waitcnt lgkmcnt(0) +; FLAT-NEXT: s_brev_b64 s[2:3], s[2:3] +; FLAT-NEXT: s_brev_b64 s[0:1], s[0:1] +; FLAT-NEXT: v_mov_b32_e32 v0, s0 +; FLAT-NEXT: v_mov_b32_e32 v1, s1 +; FLAT-NEXT: v_mov_b32_e32 v2, s2 +; FLAT-NEXT: v_mov_b32_e32 v3, s3 ; FLAT-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; FLAT-NEXT: s_endpgm +; +; GISEL-LABEL: s_brev_v2i64: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v4, s4 +; GISEL-NEXT: s_brev_b64 s[0:1], s[0:1] +; GISEL-NEXT: s_brev_b64 s[2:3], s[2:3] +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GISEL-NEXT: v_mov_b32_e32 v3, s3 +; GISEL-NEXT: v_mov_b32_e32 v5, s5 +; GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GISEL-NEXT: s_endpgm %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1 store <2 x i64> %brev, <2 x i64> addrspace(1)* %out ret void @@ -600,76 +498,13 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s0, 0xff00ff -; SI-NEXT: s_mov_b32 s1, 0xf0f0f0f -; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f0 -; SI-NEXT: s_mov_b32 s3, 0x33333333 -; SI-NEXT: s_mov_b32 s8, 0xcccccccc -; SI-NEXT: s_mov_b32 s9, 0x55555555 -; SI-NEXT: s_mov_b32 s10, 0xaaaaaaaa ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v4, v2, v2, 8 -; SI-NEXT: v_alignbit_b32 v2, v2, v2, 24 -; SI-NEXT: v_alignbit_b32 v5, v3, v3, 8 -; SI-NEXT: v_alignbit_b32 v6, v0, v0, 8 -; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 -; SI-NEXT: v_alignbit_b32 v7, v1, v1, 8 -; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 -; SI-NEXT: v_alignbit_b32 v3, v3, v3, 24 -; SI-NEXT: v_bfi_b32 v2, s0, v2, v4 -; SI-NEXT: v_bfi_b32 v4, s0, v3, v5 -; SI-NEXT: v_bfi_b32 v6, s0, v0, v6 -; SI-NEXT: v_bfi_b32 v8, s0, v1, v7 -; SI-NEXT: v_and_b32_e32 v1, s1, v2 -; SI-NEXT: v_and_b32_e32 v0, s1, v4 -; SI-NEXT: v_and_b32_e32 v3, s2, v2 -; SI-NEXT: v_and_b32_e32 v2, s2, v4 -; SI-NEXT: v_and_b32_e32 v5, s1, v6 -; SI-NEXT: v_and_b32_e32 v4, s1, v8 -; SI-NEXT: v_and_b32_e32 v7, s2, v6 -; SI-NEXT: v_and_b32_e32 v6, s2, v8 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4 -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 4 -; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 4 -; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 4 -; SI-NEXT: v_or_b32_e32 v3, v3, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_or_b32_e32 v7, v7, v5 -; SI-NEXT: v_or_b32_e32 v6, v6, v4 -; SI-NEXT: v_and_b32_e32 v1, 
s3, v3 -; SI-NEXT: v_and_b32_e32 v0, s3, v2 -; SI-NEXT: v_and_b32_e32 v5, s3, v7 -; SI-NEXT: v_and_b32_e32 v4, s3, v6 -; SI-NEXT: v_and_b32_e32 v3, s8, v3 -; SI-NEXT: v_and_b32_e32 v2, s8, v2 -; SI-NEXT: v_and_b32_e32 v7, s8, v7 -; SI-NEXT: v_and_b32_e32 v6, s8, v6 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 2 -; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 2 -; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 2 -; SI-NEXT: v_or_b32_e32 v3, v3, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_or_b32_e32 v7, v7, v5 -; SI-NEXT: v_or_b32_e32 v6, v6, v4 -; SI-NEXT: v_and_b32_e32 v1, s9, v3 -; SI-NEXT: v_and_b32_e32 v0, s9, v2 -; SI-NEXT: v_and_b32_e32 v5, s9, v7 -; SI-NEXT: v_and_b32_e32 v4, s9, v6 -; SI-NEXT: v_and_b32_e32 v3, s10, v3 -; SI-NEXT: v_and_b32_e32 v2, s10, v2 -; SI-NEXT: v_and_b32_e32 v7, s10, v7 -; SI-NEXT: v_and_b32_e32 v6, s10, v6 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 1 -; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 1 -; SI-NEXT: v_or_b32_e32 v3, v3, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_or_b32_e32 v1, v7, v5 -; SI-NEXT: v_or_b32_e32 v0, v6, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: v_bfrev_b32_e32 v4, v2 +; SI-NEXT: v_bfrev_b32_e32 v3, v3 +; SI-NEXT: v_bfrev_b32_e32 v2, v0 +; SI-NEXT: v_bfrev_b32_e32 v1, v1 +; SI-NEXT: buffer_store_dwordx4 v[1:4], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: v_brev_v2i64: @@ -677,75 +512,41 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f0 -; FLAT-NEXT: s_mov_b32 s3, 0x33333333 -; FLAT-NEXT: s_mov_b32 s8, 0xcccccccc +; FLAT-NEXT: s_mov_b32 s7, 0xf000 +; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s1 ; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; FLAT-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; FLAT-NEXT: s_mov_b32 s0, 0x10203 -; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f -; FLAT-NEXT: s_mov_b32 s9, 0x55555555 -; FLAT-NEXT: s_mov_b32 s10, 0xaaaaaaaa -; FLAT-NEXT: s_mov_b32 s7, 0xf000 -; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt vmcnt(0) -; FLAT-NEXT: v_perm_b32 v6, 0, v0, s0 -; FLAT-NEXT: v_perm_b32 v4, 0, v3, s0 -; FLAT-NEXT: v_perm_b32 v2, 0, v2, s0 -; FLAT-NEXT: v_perm_b32 v8, 0, v1, s0 -; FLAT-NEXT: v_and_b32_e32 v1, s1, v2 -; FLAT-NEXT: v_and_b32_e32 v0, s1, v4 -; FLAT-NEXT: v_and_b32_e32 v3, s2, v2 -; FLAT-NEXT: v_and_b32_e32 v2, s2, v4 -; FLAT-NEXT: v_and_b32_e32 v5, s1, v6 -; FLAT-NEXT: v_and_b32_e32 v4, s1, v8 -; FLAT-NEXT: v_and_b32_e32 v7, s2, v6 -; FLAT-NEXT: v_and_b32_e32 v6, s2, v8 -; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1] -; FLAT-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3] -; FLAT-NEXT: v_lshlrev_b64 v[4:5], 4, v[4:5] -; FLAT-NEXT: v_lshrrev_b64 v[6:7], 4, v[6:7] -; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 -; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 -; FLAT-NEXT: v_or_b32_e32 v7, v7, v5 -; FLAT-NEXT: v_or_b32_e32 v6, v6, v4 -; FLAT-NEXT: v_and_b32_e32 v1, s3, v3 -; FLAT-NEXT: v_and_b32_e32 v0, s3, v2 -; FLAT-NEXT: v_and_b32_e32 v5, s3, v7 -; FLAT-NEXT: v_and_b32_e32 v4, s3, v6 -; FLAT-NEXT: v_and_b32_e32 v3, s8, v3 -; FLAT-NEXT: v_and_b32_e32 v2, s8, v2 -; FLAT-NEXT: v_and_b32_e32 v7, s8, v7 -; FLAT-NEXT: v_and_b32_e32 v6, s8, v6 -; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; FLAT-NEXT: v_lshrrev_b64 
v[2:3], 2, v[2:3] -; FLAT-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] -; FLAT-NEXT: v_lshrrev_b64 v[6:7], 2, v[6:7] -; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 -; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 -; FLAT-NEXT: v_or_b32_e32 v7, v7, v5 -; FLAT-NEXT: v_or_b32_e32 v6, v6, v4 -; FLAT-NEXT: v_and_b32_e32 v1, s9, v3 -; FLAT-NEXT: v_and_b32_e32 v0, s9, v2 -; FLAT-NEXT: v_and_b32_e32 v5, s9, v7 -; FLAT-NEXT: v_and_b32_e32 v4, s9, v6 -; FLAT-NEXT: v_and_b32_e32 v3, s10, v3 -; FLAT-NEXT: v_and_b32_e32 v2, s10, v2 -; FLAT-NEXT: v_and_b32_e32 v7, s10, v7 -; FLAT-NEXT: v_and_b32_e32 v6, s10, v6 -; FLAT-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; FLAT-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; FLAT-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] -; FLAT-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] -; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 -; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 -; FLAT-NEXT: v_or_b32_e32 v1, v7, v5 -; FLAT-NEXT: v_or_b32_e32 v0, v6, v4 -; FLAT-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; FLAT-NEXT: v_bfrev_b32_e32 v4, v2 +; FLAT-NEXT: v_bfrev_b32_e32 v3, v3 +; FLAT-NEXT: v_bfrev_b32_e32 v2, v0 +; FLAT-NEXT: v_bfrev_b32_e32 v1, v1 +; FLAT-NEXT: buffer_store_dwordx4 v[1:4], off, s[4:7], 0 ; FLAT-NEXT: s_endpgm +; +; GISEL-LABEL: v_brev_v2i64: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_bfrev_b32_e32 v4, v1 +; GISEL-NEXT: v_bfrev_b32_e32 v5, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-NEXT: v_bfrev_b32_e32 v6, v3 +; GISEL-NEXT: v_bfrev_b32_e32 v7, v2 +; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x i64> , <2 x i64> addrspace(1)* %valptr, i32 %tid %val = load <2 x i64>, <2 x i64> addrspace(1)* %gep @@ -769,6 +570,13 @@ define float @missing_truncate_promote_bitreverse(i32 %arg) { ; FLAT-NEXT: v_bfrev_b32_e32 v0, v0 ; FLAT-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; FLAT-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: missing_truncate_promote_bitreverse: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_bfrev_b32_e32 v0, v0 +; GISEL-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-NEXT: s_setpc_b64 s[30:31] bb: %tmp = trunc i32 %arg to i16 %tmp1 = call i16 @llvm.bitreverse.i16(i16 %tmp) -- 2.7.4
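
Editor's note, not part of the upstream commit: a minimal standalone C++ sketch of the identity this patch relies on when the i64 bitreverse ends up on the VALU — reversing a 64-bit value is the same as reversing each 32-bit half and swapping the halves. This is why splitScalar64BitUnaryOp grows a `Swap` parameter and why the MIR checks place the reversed high half into sub0 of the REG_SEQUENCE. The helper names (brev32/brev64) are illustrative only and do not correspond to LLVM APIs.

    // Sketch: 64-bit bit reversal == per-half 32-bit reversal + half swap.
    #include <cassert>
    #include <cstdint>

    static uint32_t brev32(uint32_t X) {   // models S_BREV_B32 / V_BFREV_B32
      uint32_t R = 0;
      for (int I = 0; I < 32; ++I)
        R |= ((X >> I) & 1u) << (31 - I);
      return R;
    }

    static uint64_t brev64(uint64_t X) {   // models S_BREV_B64
      uint64_t R = 0;
      for (int I = 0; I < 64; ++I)
        R |= ((X >> I) & 1u) << (63 - I);
      return R;
    }

    int main() {
      uint64_t Val = 0x0123456789abcdefULL;
      uint32_t Lo = uint32_t(Val), Hi = uint32_t(Val >> 32);
      // The reversed low half becomes the new high half and vice versa,
      // matching one V_BFREV_B32 per half plus a swapped REG_SEQUENCE.
      uint64_t Split = (uint64_t(brev32(Lo)) << 32) | brev32(Hi);
      assert(Split == brev64(Val));
      return 0;
    }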