From 6454391b3111993f277fb2570d28f59699b7dae2 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 27 Dec 2022 10:37:52 -0500 Subject: [PATCH] AMDGPU/GlobalISel: Widen s1 SGPR constants during regbankselect To unambiguously interpret these as 32-bit SGPRs, we need to widen these to s32. This was selecting to a copy from a 64-bit SGPR to a 32-bit SGPR for wave64. --- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 29 + .../CodeGen/AMDGPU/GlobalISel/bool-legalization.ll | 8 +- .../GlobalISel/br-constant-invalid-sgpr-copy.ll | 142 +++ .../AMDGPU/GlobalISel/inst-select-constant.mir | 50 + .../AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll | 14 +- llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll | 12 +- .../GlobalISel/regbankselect-amdgcn.kill.mir | 10 +- .../GlobalISel/regbankselect-amdgcn.wqm.demote.mir | 10 +- .../AMDGPU/GlobalISel/regbankselect-xor.mir | 5 +- llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll | 39 +- llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll | 527 +++++---- llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll | 19 +- llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll | 1185 ++++++++++---------- 13 files changed, 1122 insertions(+), 928 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/br-constant-invalid-sgpr-copy.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 3b322d8..bffbb3d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -2117,6 +2117,35 @@ void AMDGPURegisterBankInfo::applyMappingImpl( unsigned Opc = MI.getOpcode(); MachineRegisterInfo &MRI = OpdMapper.getMRI(); switch (Opc) { + case AMDGPU::G_CONSTANT: { + Register DstReg = MI.getOperand(0).getReg(); + LLT DstTy = MRI.getType(DstReg); + if (DstTy != LLT::scalar(1)) + break; + + const RegisterBank *DstBank = + OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; + if (DstBank == &AMDGPU::VCCRegBank) + break; + 
SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0)); + if (DefRegs.empty()) + DefRegs.push_back(DstReg); + + MachineIRBuilder B(MI); + B.setInsertPt(*MI.getParent(), ++MI.getIterator()); + + Register NewDstReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); + LLVMContext &Ctx = B.getMF().getFunction().getContext(); + + MI.getOperand(0).setReg(NewDstReg); + uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue(); + MI.getOperand(1).setCImm( + ConstantInt::get(IntegerType::getInt32Ty(Ctx), ConstVal)); + + MRI.setRegBank(NewDstReg, *DstBank); + B.buildTrunc(DefRegs[0], NewDstReg); + return; + } case AMDGPU::G_PHI: { Register DstReg = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(DstReg); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll index 7b3bc41..aeb6d7d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll @@ -68,7 +68,7 @@ define amdgpu_kernel void @sgpr_trunc_brcond(i32 %cond) { ; WAVE64: ; %bb.0: ; %entry ; WAVE64-NEXT: s_load_dword s0, s[0:1], 0x24 ; WAVE64-NEXT: s_waitcnt lgkmcnt(0) -; WAVE64-NEXT: s_xor_b32 s0, s0, -1 +; WAVE64-NEXT: s_xor_b32 s0, s0, 1 ; WAVE64-NEXT: s_and_b32 s0, s0, 1 ; WAVE64-NEXT: s_cmp_lg_u32 s0, 0 ; WAVE64-NEXT: s_cbranch_scc1 .LBB3_2 @@ -85,7 +85,7 @@ define amdgpu_kernel void @sgpr_trunc_brcond(i32 %cond) { ; WAVE32: ; %bb.0: ; %entry ; WAVE32-NEXT: s_load_dword s0, s[0:1], 0x24 ; WAVE32-NEXT: s_waitcnt lgkmcnt(0) -; WAVE32-NEXT: s_xor_b32 s0, s0, -1 +; WAVE32-NEXT: s_xor_b32 s0, s0, 1 ; WAVE32-NEXT: s_and_b32 s0, s0, 1 ; WAVE32-NEXT: s_cmp_lg_u32 s0, 0 ; WAVE32-NEXT: s_cbranch_scc1 .LBB3_2 @@ -116,7 +116,7 @@ define amdgpu_kernel void @brcond_sgpr_trunc_and(i32 %cond0, i32 %cond1) { ; WAVE64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; WAVE64-NEXT: s_waitcnt lgkmcnt(0) ; WAVE64-NEXT: s_and_b32 s0, s0, s1 -; WAVE64-NEXT: s_xor_b32 s0, s0, -1 +; WAVE64-NEXT: s_xor_b32 
s0, s0, 1 ; WAVE64-NEXT: s_and_b32 s0, s0, 1 ; WAVE64-NEXT: s_cmp_lg_u32 s0, 0 ; WAVE64-NEXT: s_cbranch_scc1 .LBB4_2 @@ -134,7 +134,7 @@ define amdgpu_kernel void @brcond_sgpr_trunc_and(i32 %cond0, i32 %cond1) { ; WAVE32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; WAVE32-NEXT: s_waitcnt lgkmcnt(0) ; WAVE32-NEXT: s_and_b32 s0, s0, s1 -; WAVE32-NEXT: s_xor_b32 s0, s0, -1 +; WAVE32-NEXT: s_xor_b32 s0, s0, 1 ; WAVE32-NEXT: s_and_b32 s0, s0, 1 ; WAVE32-NEXT: s_cmp_lg_u32 s0, 0 ; WAVE32-NEXT: s_cbranch_scc1 .LBB4_2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/br-constant-invalid-sgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/br-constant-invalid-sgpr-copy.ll new file mode 100644 index 0000000..351b023 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/br-constant-invalid-sgpr-copy.ll @@ -0,0 +1,142 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=WAVE64 %s +; RUN: llc -global-isel -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck -check-prefix=WAVE32 %s + +; This was mishandling the constant true and false values used as a +; scalar branch condition. 
+ +define void @br_false() { +; WAVE64-LABEL: br_false: +; WAVE64: ; %bb.0: ; %.exit +; WAVE64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE64-NEXT: .LBB0_1: ; %bb0 +; WAVE64-NEXT: ; =>This Inner Loop Header: Depth=1 +; WAVE64-NEXT: s_mov_b32 s4, 1 +; WAVE64-NEXT: s_cmp_lg_u32 s4, 0 +; WAVE64-NEXT: s_cbranch_scc1 .LBB0_1 +; WAVE64-NEXT: ; %bb.2: ; %.exit5 +; WAVE64-NEXT: s_setpc_b64 s[30:31] +; +; WAVE32-LABEL: br_false: +; WAVE32: ; %bb.0: ; %.exit +; WAVE32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE32-NEXT: s_waitcnt_vscnt null, 0x0 +; WAVE32-NEXT: .LBB0_1: ; %bb0 +; WAVE32-NEXT: ; =>This Inner Loop Header: Depth=1 +; WAVE32-NEXT: s_mov_b32 s4, 1 +; WAVE32-NEXT: s_cmp_lg_u32 s4, 0 +; WAVE32-NEXT: s_cbranch_scc1 .LBB0_1 +; WAVE32-NEXT: ; %bb.2: ; %.exit5 +; WAVE32-NEXT: s_setpc_b64 s[30:31] +.exit: + br label %bb0 + +bb0: + br i1 false, label %.exit5, label %bb0 + +.exit5: + ret void +} + +define void @br_true() { +; WAVE64-LABEL: br_true: +; WAVE64: ; %bb.0: ; %.exit +; WAVE64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE64-NEXT: .LBB1_1: ; %bb0 +; WAVE64-NEXT: ; =>This Inner Loop Header: Depth=1 +; WAVE64-NEXT: s_mov_b32 s4, 0 +; WAVE64-NEXT: s_cmp_lg_u32 s4, 0 +; WAVE64-NEXT: s_cbranch_scc1 .LBB1_1 +; WAVE64-NEXT: ; %bb.2: ; %.exit5 +; WAVE64-NEXT: s_setpc_b64 s[30:31] +; +; WAVE32-LABEL: br_true: +; WAVE32: ; %bb.0: ; %.exit +; WAVE32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE32-NEXT: s_waitcnt_vscnt null, 0x0 +; WAVE32-NEXT: .LBB1_1: ; %bb0 +; WAVE32-NEXT: ; =>This Inner Loop Header: Depth=1 +; WAVE32-NEXT: s_mov_b32 s4, 0 +; WAVE32-NEXT: s_cmp_lg_u32 s4, 0 +; WAVE32-NEXT: s_cbranch_scc1 .LBB1_1 +; WAVE32-NEXT: ; %bb.2: ; %.exit5 +; WAVE32-NEXT: s_setpc_b64 s[30:31] +.exit: + br label %bb0 + +bb0: + br i1 true, label %.exit5, label %bb0 + +.exit5: + ret void +} + +define void @br_undef() { +; WAVE64-LABEL: br_undef: +; WAVE64: ; %bb.0: ; %.exit +; WAVE64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE64-NEXT: 
.LBB2_1: ; %bb0 +; WAVE64-NEXT: ; =>This Inner Loop Header: Depth=1 +; WAVE64-NEXT: ; implicit-def: $sgpr4 +; WAVE64-NEXT: s_and_b32 s4, s4, 1 +; WAVE64-NEXT: s_cmp_lg_u32 s4, 0 +; WAVE64-NEXT: s_cbranch_scc1 .LBB2_1 +; WAVE64-NEXT: ; %bb.2: ; %.exit5 +; WAVE64-NEXT: s_setpc_b64 s[30:31] +; +; WAVE32-LABEL: br_undef: +; WAVE32: ; %bb.0: ; %.exit +; WAVE32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE32-NEXT: s_waitcnt_vscnt null, 0x0 +; WAVE32-NEXT: .LBB2_1: ; %bb0 +; WAVE32-NEXT: ; =>This Inner Loop Header: Depth=1 +; WAVE32-NEXT: ; implicit-def: $sgpr4 +; WAVE32-NEXT: s_and_b32 s4, s4, 1 +; WAVE32-NEXT: s_cmp_lg_u32 s4, 0 +; WAVE32-NEXT: s_cbranch_scc1 .LBB2_1 +; WAVE32-NEXT: ; %bb.2: ; %.exit5 +; WAVE32-NEXT: s_setpc_b64 s[30:31] +.exit: + br label %bb0 + +bb0: + br i1 undef, label %.exit5, label %bb0 + +.exit5: + ret void +} + +define void @br_poison() { +; WAVE64-LABEL: br_poison: +; WAVE64: ; %bb.0: ; %.exit +; WAVE64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE64-NEXT: .LBB3_1: ; %bb0 +; WAVE64-NEXT: ; =>This Inner Loop Header: Depth=1 +; WAVE64-NEXT: ; implicit-def: $sgpr4 +; WAVE64-NEXT: s_and_b32 s4, s4, 1 +; WAVE64-NEXT: s_cmp_lg_u32 s4, 0 +; WAVE64-NEXT: s_cbranch_scc1 .LBB3_1 +; WAVE64-NEXT: ; %bb.2: ; %.exit5 +; WAVE64-NEXT: s_setpc_b64 s[30:31] +; +; WAVE32-LABEL: br_poison: +; WAVE32: ; %bb.0: ; %.exit +; WAVE32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE32-NEXT: s_waitcnt_vscnt null, 0x0 +; WAVE32-NEXT: .LBB3_1: ; %bb0 +; WAVE32-NEXT: ; =>This Inner Loop Header: Depth=1 +; WAVE32-NEXT: ; implicit-def: $sgpr4 +; WAVE32-NEXT: s_and_b32 s4, s4, 1 +; WAVE32-NEXT: s_cmp_lg_u32 s4, 0 +; WAVE32-NEXT: s_cbranch_scc1 .LBB3_1 +; WAVE32-NEXT: ; %bb.2: ; %.exit5 +; WAVE32-NEXT: s_setpc_b64 s[30:31] +.exit: + br label %bb0 + +bb0: + br i1 poison, label %.exit5, label %bb0 + +.exit5: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir 
index ebd6405..e968ac2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir @@ -584,3 +584,53 @@ body: | %7:vgpr(p999) = G_CONSTANT i64 18446744004990098135 S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7 ... + +--- +name: zext_sgpr_s1_to_sgpr_s32 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + ; WAVE64-LABEL: name: zext_sgpr_s1_to_sgpr_s32 + ; WAVE64: bb.0: + ; WAVE64-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; WAVE64-NEXT: {{ $}} + ; WAVE64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; WAVE64-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]] + ; WAVE64-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], 1, implicit-def $scc + ; WAVE64-NEXT: $scc = COPY [[S_AND_B32_]] + ; WAVE64-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; WAVE64-NEXT: S_BRANCH %bb.2 + ; WAVE64-NEXT: {{ $}} + ; WAVE64-NEXT: bb.1: + ; WAVE64-NEXT: successors: %bb.2(0x80000000) + ; WAVE64-NEXT: {{ $}} + ; WAVE64-NEXT: {{ $}} + ; WAVE64-NEXT: bb.2: + ; WAVE32-LABEL: name: zext_sgpr_s1_to_sgpr_s32 + ; WAVE32: bb.0: + ; WAVE32-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; WAVE32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_MOV_B32_]], 1, implicit-def $scc + ; WAVE32-NEXT: $scc = COPY [[S_AND_B32_]] + ; WAVE32-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc + ; WAVE32-NEXT: S_BRANCH %bb.2 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: bb.1: + ; WAVE32-NEXT: successors: %bb.2(0x80000000) + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: bb.2: + bb.0: + %0:sgpr(s1) = G_CONSTANT i1 true + %1:sgpr(s32) = G_ZEXT %0 + G_BRCOND %1, %bb.1 + G_BR %bb.2 + + bb.1: + + bb.2: + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll index 95ede54..8506efc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll @@ -48,6 +48,7 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x ; GCN-NEXT: s_buffer_load_dword s2, s[4:7], 0x0 ; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_mov_b32 s4, 1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s2, 56 ; GCN-NEXT: s_cselect_b32 s2, 1, 0 @@ -63,19 +64,16 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s4, 0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 -; GCN-NEXT: s_branch .LBB2_3 -; GCN-NEXT: .LBB2_2: -; GCN-NEXT: s_mov_b32 s4, -1 -; GCN-NEXT: .LBB2_3: ; %Flow -; GCN-NEXT: s_xor_b32 s2, s4, -1 +; GCN-NEXT: .LBB2_2: ; %Flow +; GCN-NEXT: s_xor_b32 s2, s4, 1 ; GCN-NEXT: s_and_b32 s2, s2, 1 ; GCN-NEXT: s_cmp_lg_u32 s2, 0 -; GCN-NEXT: s_cbranch_scc1 .LBB2_5 -; GCN-NEXT: ; %bb.4: ; %.zero +; GCN-NEXT: s_cbranch_scc1 .LBB2_4 +; GCN-NEXT: ; %bb.3: ; %.zero ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN-NEXT: .LBB2_5: ; %.exit +; GCN-NEXT: .LBB2_4: ; %.exit ; GCN-NEXT: s_endpgm %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0) %cmp = icmp eq i32 %val, 56 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll index c7e5931..2214d69 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -8,9 +8,9 @@ define amdgpu_kernel void @localize_constants(i1 %cond) { ; GFX9-LABEL: localize_constants: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: 
s_load_dword s1, s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s0, -1 +; GFX9-NEXT: s_mov_b32 s0, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_xor_b32 s1, s1, -1 +; GFX9-NEXT: s_xor_b32 s1, s1, 1 ; GFX9-NEXT: s_and_b32 s1, s1, 1 ; GFX9-NEXT: s_cmp_lg_u32 s1, 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB0_2 @@ -35,7 +35,7 @@ define amdgpu_kernel void @localize_constants(i1 %cond) { ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB0_2: ; %Flow -; GFX9-NEXT: s_xor_b32 s0, s0, -1 +; GFX9-NEXT: s_xor_b32 s0, s0, 1 ; GFX9-NEXT: s_and_b32 s0, s0, 1 ; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB0_4 @@ -96,9 +96,9 @@ define amdgpu_kernel void @localize_globals(i1 %cond) { ; GFX9-LABEL: localize_globals: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s0, -1 +; GFX9-NEXT: s_mov_b32 s0, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_xor_b32 s1, s1, -1 +; GFX9-NEXT: s_xor_b32 s1, s1, 1 ; GFX9-NEXT: s_and_b32 s1, s1, 1 ; GFX9-NEXT: s_cmp_lg_u32 s1, 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB1_2 @@ -120,7 +120,7 @@ define amdgpu_kernel void @localize_globals(i1 %cond) { ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB1_2: ; %Flow -; GFX9-NEXT: s_xor_b32 s0, s0, -1 +; GFX9-NEXT: s_xor_b32 s0, s0, 1 ; GFX9-NEXT: s_and_b32 s0, s0, 1 ; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB1_4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.kill.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.kill.mir index d419a21..8f2cabc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.kill.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.kill.mir @@ -51,8 +51,9 @@ legalized: true body: | bb.0: ; CHECK-LABEL: name: kill_constant_true - ; CHECK: [[C:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 true - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vcc(s1) = COPY [[C]](s1) + ; CHECK: 
[[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[C]](s32) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.kill), [[COPY]](s1) %0:_(s1) = G_CONSTANT i1 true G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.kill), %0 @@ -65,8 +66,9 @@ legalized: true body: | bb.0: ; CHECK-LABEL: name: kill_constant_false - ; CHECK: [[C:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 false - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vcc(s1) = COPY [[C]](s1) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[C]](s32) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.kill), [[COPY]](s1) %0:_(s1) = G_CONSTANT i1 false G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.kill), %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.demote.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.demote.mir index babec48..20803f5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.demote.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.demote.mir @@ -51,8 +51,9 @@ legalized: true body: | bb.0: ; CHECK-LABEL: name: wqm_demote_constant_true - ; CHECK: [[C:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 true - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vcc(s1) = COPY [[C]](s1) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[C]](s32) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[COPY]](s1) %0:_(s1) = G_CONSTANT i1 true G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %0 @@ -65,8 +66,9 @@ legalized: true body: | bb.0: ; CHECK-LABEL: name: wqm_demote_constant_false - ; CHECK: [[C:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 false - ; CHECK-NEXT: 
[[COPY:%[0-9]+]]:vcc(s1) = COPY [[C]](s1) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[C]](s32) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[COPY]](s1) %0:_(s1) = G_CONSTANT i1 false G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-xor.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-xor.mir index 930a1d4..4b6bd57 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-xor.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-xor.mir @@ -833,8 +833,9 @@ body: | ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]] - ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 true - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vcc(s1) = COPY [[C1]](s1) + ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[C1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) ; CHECK-NEXT: [[XOR:%[0-9]+]]:vcc(s1) = G_XOR [[ICMP]], [[COPY2]] ; CHECK-NEXT: S_NOP 0, implicit [[XOR]](s1) %0:_(s32) = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 64ab731..2f422cd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -192,11 +192,12 @@ declare i32 @llvm.amdgcn.readfirstlane(i32) define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-LABEL: s_sdiv_i64: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5] -; CHECK-NEXT: s_mov_b32 s0, 0 -; CHECK-NEXT: s_mov_b32 s1, -1 -; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], s[0:1] -; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, 
s[6:7], 0 +; CHECK-NEXT: s_mov_b32 s6, 0 +; CHECK-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] +; CHECK-NEXT: s_mov_b32 s7, -1 +; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[0:1], 0 +; CHECK-NEXT: s_mov_b32 s0, 1 ; CHECK-NEXT: s_cbranch_vccz .LBB1_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: s_ashr_i32 s6, s3, 31 @@ -326,12 +327,12 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9] ; CHECK-NEXT: v_xor_b32_e32 v0, s0, v0 ; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 -; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_branch .LBB1_3 ; CHECK-NEXT: .LBB1_2: ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: .LBB1_3: ; %Flow -; CHECK-NEXT: s_xor_b32 s0, s1, -1 +; CHECK-NEXT: s_xor_b32 s0, s0, 1 ; CHECK-NEXT: s_and_b32 s0, s0, 1 ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB1_5 @@ -1091,7 +1092,7 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 ; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], 0, v5, v[1:2] ; CHECK-NEXT: v_mov_b32_e32 v6, 0x1000 -; CHECK-NEXT: s_bfe_i32 s6, -1, 0x10000 +; CHECK-NEXT: s_bfe_i32 s6, 1, 0x10000 ; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc ; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc @@ -1104,7 +1105,7 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: v_cndmask_b32_e64 v2, v8, v4, s[4:5] ; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v5 ; CHECK-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc -; CHECK-NEXT: s_bfe_i32 s4, -1, 0x10000 +; CHECK-NEXT: s_bfe_i32 s4, 1, 0x10000 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CHECK-NEXT: v_mov_b32_e32 v6, s4 @@ -1404,7 +1405,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: s_movk_i32 s7, 0x1000 ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; 
CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 +; CGP-NEXT: s_bfe_i32 s8, 1, 0x10000 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v6, v5 @@ -1517,7 +1518,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_trunc_f32_e32 v6, v6 ; CGP-NEXT: v_mac_f32_e32 v1, 0xcf800000, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v13, v1 -; CGP-NEXT: s_bfe_i32 s4, -1, 0x10000 +; CGP-NEXT: s_bfe_i32 s4, 1, 0x10000 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 ; CGP-NEXT: v_mov_b32_e32 v15, s4 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v13, 0 @@ -1622,7 +1623,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc ; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v12, v3 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: s_bfe_i32 s6, -1, 0x10000 +; CGP-NEXT: s_bfe_i32 s6, 1, 0x10000 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] @@ -1632,7 +1633,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v4, v7, v6, s[4:5] ; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v11 ; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc -; CGP-NEXT: s_bfe_i32 s4, -1, 0x10000 +; CGP-NEXT: s_bfe_i32 s4, 1, 0x10000 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; CGP-NEXT: v_mov_b32_e32 v5, s4 @@ -1755,7 +1756,7 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 ; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], 0, v5, v[1:2] ; CHECK-NEXT: v_mov_b32_e32 v6, 0x12d8fb -; CHECK-NEXT: s_bfe_i32 s6, -1, 0x10000 +; CHECK-NEXT: s_bfe_i32 s6, 1, 0x10000 ; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc ; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc @@ -1768,7 +1769,7 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: 
v_cndmask_b32_e64 v2, v8, v4, s[4:5] ; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v5 ; CHECK-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc -; CHECK-NEXT: s_bfe_i32 s4, -1, 0x10000 +; CHECK-NEXT: s_bfe_i32 s4, 1, 0x10000 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CHECK-NEXT: v_mov_b32_e32 v6, s4 @@ -2068,7 +2069,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: s_mov_b32 s7, 0x12d8fb ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 +; CGP-NEXT: s_bfe_i32 s8, 1, 0x10000 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v6, v5 @@ -2181,7 +2182,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_trunc_f32_e32 v6, v6 ; CGP-NEXT: v_mac_f32_e32 v1, 0xcf800000, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v13, v1 -; CGP-NEXT: s_bfe_i32 s4, -1, 0x10000 +; CGP-NEXT: s_bfe_i32 s4, 1, 0x10000 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 ; CGP-NEXT: v_mov_b32_e32 v15, s4 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v13, 0 @@ -2286,7 +2287,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc ; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v12, v3 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: s_bfe_i32 s6, -1, 0x10000 +; CGP-NEXT: s_bfe_i32 s6, 1, 0x10000 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] @@ -2296,7 +2297,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v4, v7, v6, s[4:5] ; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v11 ; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc -; CGP-NEXT: s_bfe_i32 s4, -1, 0x10000 +; CGP-NEXT: s_bfe_i32 s4, 1, 0x10000 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; CGP-NEXT: v_mov_b32_e32 v5, 
s4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 91e435a..a40fe24 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -188,11 +188,12 @@ declare i32 @llvm.amdgcn.readfirstlane(i32) define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-LABEL: s_srem_i64: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5] -; CHECK-NEXT: s_mov_b32 s0, 0 -; CHECK-NEXT: s_mov_b32 s1, -1 -; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], s[0:1] -; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[6:7], 0 +; CHECK-NEXT: s_mov_b32 s6, 0 +; CHECK-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] +; CHECK-NEXT: s_mov_b32 s7, -1 +; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[0:1], 0 +; CHECK-NEXT: s_mov_b32 s0, 1 ; CHECK-NEXT: s_cbranch_vccz .LBB1_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: s_ashr_i32 s6, s3, 31 @@ -320,12 +321,12 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, s6, v0 ; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0 -; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_branch .LBB1_3 ; CHECK-NEXT: .LBB1_2: ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: .LBB1_3: ; %Flow -; CHECK-NEXT: s_xor_b32 s0, s1, -1 +; CHECK-NEXT: s_xor_b32 s0, s0, 1 ; CHECK-NEXT: s_and_b32 s0, s0, 1 ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB1_5 @@ -1073,27 +1074,25 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 ; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], 0, v5, v[1:2] ; CHECK-NEXT: v_mov_b32_e32 v3, 0x1000 -; CHECK-NEXT: s_bfe_i32 s6, -1, 0x10000 +; CHECK-NEXT: s_bfe_i32 s6, 1, 0x10000 ; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc ; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, 
vcc +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v0, v3 +; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] ; CHECK-NEXT: v_mov_b32_e32 v5, s6 ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[4:5] -; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v0, v3 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: s_bfe_i32 s4, -1, 0x10000 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; CHECK-NEXT: v_mov_b32_e32 v8, s4 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v6, v3 ; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; CHECK-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc @@ -1382,7 +1381,7 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: s_movk_i32 s7, 0x1000 ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 +; CGP-NEXT: s_bfe_i32 s8, 1, 0x10000 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v6, v5 @@ -1471,159 +1470,153 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_mad_u64_u32 
v[6:7], s[4:5], s7, v5, v[1:2] -; CGP-NEXT: v_mov_b32_e32 v5, 0x1000 +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v10, v0 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v8, v[6:7] -; CGP-NEXT: v_sub_i32_e32 v8, vcc, v10, v0 -; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v11, v6, vcc +; CGP-NEXT: v_mov_b32_e32 v5, 0x1000 +; CGP-NEXT: v_cvt_f32_ubyte0_e32 v7, 0 +; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v11, v6, vcc ; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v11, v6 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v5 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; CGP-NEXT: v_mov_b32_e32 v6, s8 -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9 -; CGP-NEXT: v_cndmask_b32_e64 v10, v6, v1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 +; CGP-NEXT: v_cndmask_b32_e64 v11, v6, v1, s[4:5] ; CGP-NEXT: v_cvt_f32_u32_e32 v1, 0x1000 -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; CGP-NEXT: v_mac_f32_e32 v1, 0x4f800000, v6 +; CGP-NEXT: v_sub_i32_e32 v12, vcc, v9, v5 +; CGP-NEXT: v_mac_f32_e32 v1, 0x4f800000, v7 ; CGP-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; CGP-NEXT: v_sub_i32_e32 v11, vcc, v8, v5 -; CGP-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v0, vcc +; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v0, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v12, v5 ; CGP-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 ; CGP-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; CGP-NEXT: v_trunc_f32_e32 v6, v1 -; CGP-NEXT: v_mac_f32_e32 v0, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v0 -; CGP-NEXT: s_bfe_i32 s4, -1, 0x10000 -; CGP-NEXT: v_mov_b32_e32 v14, s4 -; CGP-NEXT: v_cvt_u32_f32_e32 v15, v6 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v13, 0 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v11, v5 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 -; CGP-NEXT: v_cndmask_b32_e32 v14, v14, v7, vcc -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v15, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v11, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], 
s[4:5], -1, v13, v[6:7] -; CGP-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v12, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; CGP-NEXT: v_cndmask_b32_e32 v7, v11, v1, vcc +; CGP-NEXT: v_trunc_f32_e32 v7, v1 +; CGP-NEXT: v_mac_f32_e32 v0, 0xcf800000, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v0 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v7 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v14, 0 +; CGP-NEXT: v_cndmask_b32_e32 v16, v6, v8, vcc +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v15, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v12, v5 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], -1, v14, v[7:8] +; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v13, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; CGP-NEXT: v_cndmask_b32_e32 v8, v12, v1, vcc ; CGP-NEXT: v_mul_lo_u32 v1, v15, v0 -; CGP-NEXT: v_mul_lo_u32 v11, v13, v6 -; CGP-NEXT: v_mul_hi_u32 v14, v13, v0 -; CGP-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc +; CGP-NEXT: v_mul_lo_u32 v12, v14, v7 +; CGP-NEXT: v_mul_hi_u32 v16, v14, v0 +; CGP-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc ; CGP-NEXT: v_mul_hi_u32 v0, v15, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v14 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v16 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v15, v6 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; CGP-NEXT: v_mul_hi_u32 v11, v13, v6 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v14, v11 -; CGP-NEXT: v_mul_hi_u32 v6, v15, v6 +; CGP-NEXT: v_mul_lo_u32 v16, v15, v7 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v12, v1 +; CGP-NEXT: v_mul_hi_u32 v12, v14, v7 +; CGP-NEXT: v_add_i32_e32 
v0, vcc, v16, v0 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v16, v12 +; CGP-NEXT: v_mul_hi_u32 v7, v15, v7 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v6, v1 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v0 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, v15, v1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v11, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; CGP-NEXT: v_cndmask_b32_e32 v8, v8, v7, vcc -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2] -; CGP-NEXT: v_xor_b32_e32 v1, v8, v4 -; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v3 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v11, v[6:7] -; CGP-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v12, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v7, v1 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v0 +; CGP-NEXT: v_addc_u32_e32 v14, vcc, v15, v1, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v12, 0 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; CGP-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v14, v[1:2] +; CGP-NEXT: v_xor_b32_e32 v1, v9, v4 +; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v3 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], -1, v12, v[7:8] +; CGP-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v2, v9 +; CGP-NEXT: v_mul_lo_u32 v2, v14, v0 +; CGP-NEXT: v_mul_lo_u32 v8, v12, v7 +; CGP-NEXT: v_xor_b32_e32 v13, v3, v9 +; CGP-NEXT: v_mul_hi_u32 v3, v12, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v14, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc -; CGP-NEXT: v_xor_b32_e32 v10, v2, v8 -; CGP-NEXT: v_mul_lo_u32 v2, v13, v0 -; CGP-NEXT: v_mul_lo_u32 v7, v11, v6 -; 
CGP-NEXT: v_xor_b32_e32 v12, v3, v8 -; CGP-NEXT: v_mul_hi_u32 v3, v11, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v13, v0 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v13, v6 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CGP-NEXT: v_mul_hi_u32 v7, v11, v6 +; CGP-NEXT: v_mul_lo_u32 v3, v14, v7 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; CGP-NEXT: v_mul_hi_u32 v8, v12, v7 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; CGP-NEXT: v_mul_hi_u32 v7, v14, v7 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v12, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v10, v2 -; CGP-NEXT: v_mul_hi_u32 v7, v10, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v12, v0 -; CGP-NEXT: v_xor_b32_e32 v9, v9, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v3, v13, v0 +; CGP-NEXT: v_mul_lo_u32 v7, v11, v2 +; CGP-NEXT: v_mul_hi_u32 v8, v11, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v13, v0 +; CGP-NEXT: v_xor_b32_e32 v10, v10, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; 
CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v12, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; CGP-NEXT: v_mul_hi_u32 v6, v10, v2 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; CGP-NEXT: v_mul_lo_u32 v8, v13, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CGP-NEXT: v_mul_hi_u32 v7, v11, v2 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v0, v3 -; CGP-NEXT: v_mul_hi_u32 v7, v12, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s7, v11, 0 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v0, v3 +; CGP-NEXT: v_mul_hi_u32 v8, v13, v2 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s7, v12, 0 ; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v0 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v0 ; CGP-NEXT: v_mov_b32_e32 v0, v3 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v6, v[0:1] +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s7, v7, v[0:1] ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v9, v4, vcc -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v11, v[6:7] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v10, v2 -; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v12, v3 -; CGP-NEXT: s_bfe_i32 s6, -1, 0x10000 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CGP-NEXT: v_mov_b32_e32 v7, s6 -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v12, v[7:8] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 +; CGP-NEXT: v_subb_u32_e64 v4, 
s[4:5], v13, v3, vcc +; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cndmask_b32_e64 v6, v7, v6, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v5 +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v2, v5 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: s_bfe_i32 s4, -1, 0x10000 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v5 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc -; CGP-NEXT: v_mov_b32_e32 v10, s4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v5 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_cndmask_b32_e64 v7, v6, v7, s[4:5] +; CGP-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5 ; CGP-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; CGP-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; CGP-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v8 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v8 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v9 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v9 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i64> %num, ret <2 x i64> %result @@ -1729,27 +1722,25 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 ; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], 
0, v5, v[1:2] ; CHECK-NEXT: v_mov_b32_e32 v3, 0x12d8fb -; CHECK-NEXT: s_bfe_i32 s6, -1, 0x10000 +; CHECK-NEXT: s_bfe_i32 s6, 1, 0x10000 ; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc ; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v0, v3 +; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] ; CHECK-NEXT: v_mov_b32_e32 v5, s6 ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[4:5] -; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v0, v3 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: s_bfe_i32 s4, -1, 0x10000 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; CHECK-NEXT: v_mov_b32_e32 v8, s4 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v6, v3 ; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; CHECK-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc @@ -2038,7 +2029,7 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: s_mov_b32 s7, 0x12d8fb ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 +; CGP-NEXT: s_bfe_i32 s8, 1, 0x10000 ; CGP-NEXT: v_mul_f32_e32 v4, 
0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v6, v5 @@ -2127,159 +2118,153 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v5, v[1:2] -; CGP-NEXT: v_mov_b32_e32 v5, 0x12d8fb +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v10, v0 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v8, v[6:7] -; CGP-NEXT: v_sub_i32_e32 v8, vcc, v10, v0 -; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v11, v6, vcc +; CGP-NEXT: v_mov_b32_e32 v5, 0x12d8fb +; CGP-NEXT: v_cvt_f32_ubyte0_e32 v7, 0 +; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v11, v6, vcc ; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v11, v6 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v5 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; CGP-NEXT: v_mov_b32_e32 v6, s8 -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9 -; CGP-NEXT: v_cndmask_b32_e64 v10, v6, v1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 +; CGP-NEXT: v_cndmask_b32_e64 v11, v6, v1, s[4:5] ; CGP-NEXT: v_cvt_f32_u32_e32 v1, 0x12d8fb -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; CGP-NEXT: v_mac_f32_e32 v1, 0x4f800000, v6 +; CGP-NEXT: v_sub_i32_e32 v12, vcc, v9, v5 +; CGP-NEXT: v_mac_f32_e32 v1, 0x4f800000, v7 ; CGP-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; CGP-NEXT: v_sub_i32_e32 v11, vcc, v8, v5 -; CGP-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v0, vcc +; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v0, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v12, v5 ; CGP-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 ; CGP-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; CGP-NEXT: v_trunc_f32_e32 v6, v1 -; CGP-NEXT: v_mac_f32_e32 v0, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v0 -; CGP-NEXT: s_bfe_i32 s4, -1, 0x10000 -; CGP-NEXT: v_mov_b32_e32 v14, s4 -; CGP-NEXT: v_cvt_u32_f32_e32 v15, v6 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v13, 0 -; CGP-NEXT: 
v_cmp_ge_u32_e32 vcc, v11, v5 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 -; CGP-NEXT: v_cndmask_b32_e32 v14, v14, v7, vcc -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v15, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v11, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v13, v[6:7] -; CGP-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v12, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; CGP-NEXT: v_cndmask_b32_e32 v7, v11, v1, vcc +; CGP-NEXT: v_trunc_f32_e32 v7, v1 +; CGP-NEXT: v_mac_f32_e32 v0, 0xcf800000, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v0 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v7 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v14, 0 +; CGP-NEXT: v_cndmask_b32_e32 v16, v6, v8, vcc +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v15, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v12, v5 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], -1, v14, v[7:8] +; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v13, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; CGP-NEXT: v_cndmask_b32_e32 v8, v12, v1, vcc ; CGP-NEXT: v_mul_lo_u32 v1, v15, v0 -; CGP-NEXT: v_mul_lo_u32 v11, v13, v6 -; CGP-NEXT: v_mul_hi_u32 v14, v13, v0 -; CGP-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc +; CGP-NEXT: v_mul_lo_u32 v12, v14, v7 +; CGP-NEXT: v_mul_hi_u32 v16, v14, v0 +; CGP-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc ; CGP-NEXT: v_mul_hi_u32 v0, v15, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v14 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v16 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v15, v6 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; CGP-NEXT: v_mul_hi_u32 v11, v13, v6 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: 
v_add_i32_e32 v0, vcc, v0, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v14, v11 -; CGP-NEXT: v_mul_hi_u32 v6, v15, v6 +; CGP-NEXT: v_mul_lo_u32 v16, v15, v7 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v12, v1 +; CGP-NEXT: v_mul_hi_u32 v12, v14, v7 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v16, v12 +; CGP-NEXT: v_mul_hi_u32 v7, v15, v7 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v6, v1 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v0 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, v15, v1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v11, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; CGP-NEXT: v_cndmask_b32_e32 v8, v8, v7, vcc -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2] -; CGP-NEXT: v_xor_b32_e32 v1, v8, v4 -; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v3 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v11, v[6:7] -; CGP-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v12, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v7, v1 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v0 +; CGP-NEXT: v_addc_u32_e32 v14, vcc, v15, v1, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v12, 0 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; CGP-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v14, v[1:2] +; CGP-NEXT: v_xor_b32_e32 v1, v9, v4 +; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v3 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], -1, v12, v[7:8] +; CGP-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v2, v9 +; CGP-NEXT: v_mul_lo_u32 v2, v14, v0 +; CGP-NEXT: v_mul_lo_u32 v8, v12, v7 +; CGP-NEXT: 
v_xor_b32_e32 v13, v3, v9 +; CGP-NEXT: v_mul_hi_u32 v3, v12, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v14, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc -; CGP-NEXT: v_xor_b32_e32 v10, v2, v8 -; CGP-NEXT: v_mul_lo_u32 v2, v13, v0 -; CGP-NEXT: v_mul_lo_u32 v7, v11, v6 -; CGP-NEXT: v_xor_b32_e32 v12, v3, v8 -; CGP-NEXT: v_mul_hi_u32 v3, v11, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v13, v0 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v13, v6 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CGP-NEXT: v_mul_hi_u32 v7, v11, v6 +; CGP-NEXT: v_mul_lo_u32 v3, v14, v7 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; CGP-NEXT: v_mul_hi_u32 v8, v12, v7 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; CGP-NEXT: v_mul_hi_u32 v7, v14, v7 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v12, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v10, v2 -; CGP-NEXT: v_mul_hi_u32 v7, v10, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v12, v0 -; CGP-NEXT: v_xor_b32_e32 v9, v9, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc +; CGP-NEXT: 
v_mul_lo_u32 v3, v13, v0 +; CGP-NEXT: v_mul_lo_u32 v7, v11, v2 +; CGP-NEXT: v_mul_hi_u32 v8, v11, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v13, v0 +; CGP-NEXT: v_xor_b32_e32 v10, v10, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v12, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; CGP-NEXT: v_mul_hi_u32 v6, v10, v2 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; CGP-NEXT: v_mul_lo_u32 v8, v13, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CGP-NEXT: v_mul_hi_u32 v7, v11, v2 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v0, v3 -; CGP-NEXT: v_mul_hi_u32 v7, v12, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s7, v11, 0 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v0, v3 +; CGP-NEXT: v_mul_hi_u32 v8, v13, v2 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s7, v12, 0 ; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v0 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v0 ; CGP-NEXT: v_mov_b32_e32 v0, v3 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v6, v[0:1] +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s7, v7, v[0:1] ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v9, v4, vcc -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v11, v[6:7] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v10, v2 -; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v12, v3 -; CGP-NEXT: s_bfe_i32 s6, -1, 0x10000 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5 -; 
CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CGP-NEXT: v_mov_b32_e32 v7, s6 -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v12, v[7:8] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 +; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v3, vcc +; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cndmask_b32_e64 v6, v7, v6, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v5 +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v2, v5 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: s_bfe_i32 s4, -1, 0x10000 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v5 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc -; CGP-NEXT: v_mov_b32_e32 v10, s4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v5 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_cndmask_b32_e64 v7, v6, v7, s[4:5] +; CGP-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5 ; CGP-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; CGP-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; CGP-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v8 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v8 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v9 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v9 +; CGP-NEXT: v_sub_i32_e32 
v2, vcc, v2, v9 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i64> %num, ret <2 x i64> %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll index 0f26b64..93e8a43 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -186,10 +186,11 @@ declare i32 @llvm.amdgcn.readfirstlane(i32) define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-LABEL: s_udiv_i64: ; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 s4, 1 ; CHECK-NEXT: s_or_b64 s[6:7], s[0:1], s[2:3] -; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: s_mov_b32 s5, -1 -; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], s[4:5] +; CHECK-NEXT: s_mov_b32 s8, 0 +; CHECK-NEXT: s_mov_b32 s9, -1 +; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], s[8:9] ; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[6:7], 0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s2 ; CHECK-NEXT: s_cbranch_vccz .LBB1_2 @@ -316,12 +317,12 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: s_branch .LBB1_3 ; CHECK-NEXT: .LBB1_2: ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: .LBB1_3: ; %Flow -; CHECK-NEXT: s_xor_b32 s1, s5, -1 +; CHECK-NEXT: s_xor_b32 s1, s4, 1 ; CHECK-NEXT: s_and_b32 s1, s1, 1 ; CHECK-NEXT: s_cmp_lg_u32 s1, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB1_5 @@ -1971,10 +1972,10 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v20, vcc, v20, v6 ; GISEL-NEXT: v_and_b32_e32 v6, 0xffffff, v0 ; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v2 -; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s5, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s6, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s7, -1, 0x10000 +; 
GISEL-NEXT: s_bfe_i32 s4, 1, 0x10000 +; GISEL-NEXT: s_bfe_i32 s5, 1, 0x10000 +; GISEL-NEXT: s_bfe_i32 s6, 1, 0x10000 +; GISEL-NEXT: s_bfe_i32 s7, 1, 0x10000 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v18, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index 261482c..61a9f26 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -183,10 +183,11 @@ declare i32 @llvm.amdgcn.readfirstlane(i32) define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-LABEL: s_urem_i64: ; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 s4, 1 ; CHECK-NEXT: s_or_b64 s[6:7], s[0:1], s[2:3] -; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: s_mov_b32 s5, -1 -; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], s[4:5] +; CHECK-NEXT: s_mov_b32 s8, 0 +; CHECK-NEXT: s_mov_b32 s9, -1 +; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], s[8:9] ; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[6:7], 0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s2 ; CHECK-NEXT: s_cbranch_vccz .LBB1_2 @@ -312,12 +313,12 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: s_branch .LBB1_3 ; CHECK-NEXT: .LBB1_2: ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: .LBB1_3: ; %Flow -; CHECK-NEXT: s_xor_b32 s1, s5, -1 +; CHECK-NEXT: s_xor_b32 s1, s4, 1 ; CHECK-NEXT: s_and_b32 s1, s1, 1 ; CHECK-NEXT: s_cmp_lg_u32 s1, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB1_5 @@ -973,121 +974,119 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_cvt_f32_u32_e32 v3, 0x12d8fb ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 ; CHECK-NEXT: s_mov_b32 s5, 0xffed2705 -; CHECK-NEXT: s_bfe_i32 s6, -1, 0x10000 -; CHECK-NEXT: s_bfe_i32 
s7, -1, 0x10000 +; CHECK-NEXT: s_bfe_i32 s6, 1, 0x10000 ; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 ; CHECK-NEXT: v_mov_b32_e32 v4, s6 -; CHECK-NEXT: v_mov_b32_e32 v5, s7 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v3 -; CHECK-NEXT: v_trunc_f32_e32 v6, v6 -; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v6 -; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v6 +; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v3 +; CHECK-NEXT: v_trunc_f32_e32 v5, v5 +; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5 ; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CHECK-NEXT: v_mul_lo_u32 v7, v6, s5 -; CHECK-NEXT: v_mul_lo_u32 v8, v3, s5 -; CHECK-NEXT: v_mul_hi_u32 v9, s5, v3 -; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; CHECK-NEXT: v_mul_lo_u32 v9, v6, v8 -; CHECK-NEXT: v_mul_hi_u32 v10, v3, v8 -; CHECK-NEXT: v_mul_hi_u32 v8, v6, v8 -; CHECK-NEXT: v_mul_lo_u32 v11, v3, v7 -; CHECK-NEXT: v_mul_lo_u32 v12, v6, v7 -; CHECK-NEXT: v_mul_hi_u32 v13, v3, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v6, v7 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v12, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CHECK-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; CHECK-NEXT: v_addc_u32_e32 v6, vcc, v6, v7, vcc +; CHECK-NEXT: v_mul_lo_u32 v6, v5, s5 ; CHECK-NEXT: v_mul_lo_u32 v7, v3, s5 ; CHECK-NEXT: v_mul_hi_u32 v8, s5, v3 -; 
CHECK-NEXT: v_mul_lo_u32 v9, v6, s5 -; CHECK-NEXT: v_mul_lo_u32 v10, v6, v7 -; CHECK-NEXT: v_mul_hi_u32 v11, v3, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v6, v7 -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, v9, v3 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CHECK-NEXT: v_mul_lo_u32 v9, v3, v8 -; CHECK-NEXT: v_mul_lo_u32 v12, v6, v8 -; CHECK-NEXT: v_mul_hi_u32 v13, v3, v8 -; CHECK-NEXT: v_mul_hi_u32 v8, v6, v8 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v6, v3 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_mul_lo_u32 v8, v5, v7 +; CHECK-NEXT: v_mul_hi_u32 v9, v3, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v5, v7 +; CHECK-NEXT: v_mul_lo_u32 v10, v3, v6 +; CHECK-NEXT: v_mul_lo_u32 v11, v5, v6 +; CHECK-NEXT: v_mul_hi_u32 v12, v3, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v5, v6 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v13 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CHECK-NEXT: v_add_i32_e32 v10, vcc, v12, v11 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CHECK-NEXT: v_addc_u32_e32 
v6, vcc, v6, v8, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v1, v3 -; CHECK-NEXT: v_mul_hi_u32 v8, v0, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v1, v3 -; CHECK-NEXT: v_mul_lo_u32 v9, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v10, v1, v6 -; CHECK-NEXT: v_mul_hi_u32 v11, v0, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v1, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v6, vcc +; CHECK-NEXT: v_mul_lo_u32 v6, v3, s5 +; CHECK-NEXT: v_mul_hi_u32 v7, s5, v3 +; CHECK-NEXT: v_mul_lo_u32 v8, v5, s5 +; CHECK-NEXT: v_mul_lo_u32 v9, v5, v6 +; CHECK-NEXT: v_mul_hi_u32 v10, v3, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v5, v6 +; CHECK-NEXT: v_sub_i32_e32 v8, vcc, v8, v3 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_mul_lo_u32 v8, v3, v7 +; CHECK-NEXT: v_mul_lo_u32 v11, v5, v7 +; CHECK-NEXT: v_mul_hi_u32 v12, v3, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v5, v7 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v10, v3 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v11, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v11, v10 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v11 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc +; CHECK-NEXT: v_mul_lo_u32 v6, v1, v3 +; CHECK-NEXT: v_mul_hi_u32 v7, v0, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v1, v3 +; CHECK-NEXT: v_mul_lo_u32 v8, v0, v5 +; CHECK-NEXT: v_mul_lo_u32 v9, v1, v5 +; CHECK-NEXT: v_mul_hi_u32 v10, v0, v5 +; 
CHECK-NEXT: v_mul_hi_u32 v5, v1, v5 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CHECK-NEXT: v_mul_lo_u32 v8, v3, s4 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_mul_lo_u32 v7, v3, s4 ; CHECK-NEXT: v_mul_hi_u32 v3, s4, v3 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CHECK-NEXT: v_mul_lo_u32 v6, v6, s4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_subb_u32_e64 v6, s[4:5], v1, v3, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_mul_lo_u32 v5, v5, s4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 +; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v1, v3, vcc ; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v3 ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v6 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v0, v2 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 
-; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v4, v2 +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v6, v2 ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = urem i64 %num, 1235195 ret i64 %result @@ -1097,68 +1096,66 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-LABEL: v_urem_v2i64_oddk_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s8, 0x12d8fb +; GISEL-NEXT: s_mov_b32 s6, 0x12d8fb ; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb -; GISEL-NEXT: v_cvt_f32_u32_e32 v7, 0x12d8fb +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, 0x12d8fb ; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb -; GISEL-NEXT: v_madmk_f32 v6, v5, 0x4f800000, v7 -; GISEL-NEXT: s_subb_u32 s7, 0, 0 -; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s5, -1, 0x10000 -; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v6 -; GISEL-NEXT: v_mov_b32_e32 v6, s4 -; GISEL-NEXT: v_mov_b32_e32 v5, s5 +; GISEL-NEXT: s_sub_u32 s7, 0, 0x12d8fb +; GISEL-NEXT: v_madmk_f32 v7, v5, 0x4f800000, v6 +; GISEL-NEXT: s_subb_u32 s8, 0, 0 +; GISEL-NEXT: s_bfe_i32 s4, 1, 0x10000 +; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v5 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7 +; GISEL-NEXT: v_mov_b32_e32 v5, s4 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GISEL-NEXT: s_sub_u32 s9, 0, 0x12d8fb -; GISEL-NEXT: 
v_mul_f32_e32 v8, 0x5f7ffffc, v8 ; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 ; GISEL-NEXT: s_subb_u32 s10, 0, 0 -; GISEL-NEXT: s_bfe_i32 s11, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s12, -1, 0x10000 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v7 +; GISEL-NEXT: s_bfe_i32 s4, 1, 0x10000 +; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v6 +; GISEL-NEXT: v_mov_b32_e32 v10, s4 +; GISEL-NEXT: v_trunc_f32_e32 v8, v8 ; GISEL-NEXT: v_trunc_f32_e32 v9, v9 -; GISEL-NEXT: v_trunc_f32_e32 v10, v10 -; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v10 -; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 +; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, s6, v9 +; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GISEL-NEXT: v_mul_lo_u32 v12, s9, v10 -; GISEL-NEXT: v_mul_lo_u32 v13, s6, v8 -; GISEL-NEXT: v_mul_lo_u32 v14, s7, v8 -; GISEL-NEXT: v_mul_hi_u32 v15, s6, v8 -; GISEL-NEXT: v_mul_lo_u32 v16, s9, v7 -; GISEL-NEXT: v_mul_lo_u32 v17, s10, v7 -; GISEL-NEXT: v_mul_hi_u32 v18, s9, v7 +; GISEL-NEXT: v_mul_lo_u32 v11, s7, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_mul_lo_u32 v12, s9, v9 +; GISEL-NEXT: v_mul_lo_u32 v13, s7, v7 +; GISEL-NEXT: v_mul_lo_u32 v14, s8, v7 +; GISEL-NEXT: v_mul_hi_u32 v15, s7, v7 +; GISEL-NEXT: v_mul_lo_u32 v16, s9, v6 +; GISEL-NEXT: v_mul_lo_u32 v17, s10, v6 +; GISEL-NEXT: v_mul_hi_u32 v18, s9, v6 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 -; GISEL-NEXT: v_mul_lo_u32 v14, v9, v13 -; GISEL-NEXT: v_mul_hi_u32 v19, v8, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 +; GISEL-NEXT: v_mul_lo_u32 v14, v8, v13 +; GISEL-NEXT: v_mul_hi_u32 v19, v7, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v8, 
v13 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; GISEL-NEXT: v_mul_lo_u32 v17, v10, v16 +; GISEL-NEXT: v_mul_lo_u32 v17, v9, v16 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 -; GISEL-NEXT: v_mul_hi_u32 v15, v7, v16 -; GISEL-NEXT: v_mul_hi_u32 v16, v10, v16 +; GISEL-NEXT: v_mul_hi_u32 v15, v6, v16 +; GISEL-NEXT: v_mul_hi_u32 v16, v9, v16 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v7, v12 +; GISEL-NEXT: v_mul_lo_u32 v18, v6, v12 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; GISEL-NEXT: v_mul_lo_u32 v15, v8, v11 -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v15, v7, v11 +; GISEL-NEXT: v_mul_lo_u32 v17, v8, v11 ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19 -; GISEL-NEXT: v_mul_hi_u32 v14, v8, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v10, v12 +; GISEL-NEXT: v_mul_lo_u32 v19, v9, v12 ; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v17, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 @@ -1166,8 +1163,8 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v17, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v18, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v7, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v10, v12 +; GISEL-NEXT: v_mul_hi_u32 v18, v6, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 @@ -1181,41 +1178,42 @@ define <2 x i64> 
@v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v17 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, s6, v8 -; GISEL-NEXT: v_mul_lo_u32 v13, s7, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, s6, v8 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v16 -; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v10, v12, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, s9, v7 -; GISEL-NEXT: v_mul_lo_u32 v15, s10, v7 -; GISEL-NEXT: v_mul_hi_u32 v16, s9, v7 -; GISEL-NEXT: v_mul_lo_u32 v17, s6, v9 -; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11 -; GISEL-NEXT: v_mul_hi_u32 v19, v8, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v11, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, s7, v7 +; GISEL-NEXT: v_mul_lo_u32 v13, s8, v7 +; GISEL-NEXT: v_mul_hi_u32 v14, s7, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v16 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v12, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, s9, v6 +; GISEL-NEXT: v_mul_lo_u32 v15, s10, v6 +; GISEL-NEXT: v_mul_hi_u32 v16, s9, v6 +; GISEL-NEXT: v_mul_lo_u32 v17, s7, v8 +; GISEL-NEXT: v_mul_lo_u32 v18, v8, v11 +; GISEL-NEXT: v_mul_hi_u32 v19, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 -; GISEL-NEXT: v_mul_lo_u32 v17, s9, v10 +; GISEL-NEXT: v_mul_lo_u32 v17, s9, v9 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v17 -; GISEL-NEXT: v_mul_lo_u32 v17, v10, v12 +; GISEL-NEXT: v_mul_lo_u32 v17, v9, v12 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v10, v12 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v7, v15 +; GISEL-NEXT: v_mul_lo_u32 v16, v6, v15 ; GISEL-NEXT: 
v_add_i32_e32 v16, vcc, v17, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; GISEL-NEXT: v_mul_lo_u32 v14, v8, v13 -; GISEL-NEXT: v_mul_lo_u32 v16, v9, v13 +; GISEL-NEXT: v_mul_lo_u32 v14, v7, v13 +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v13 ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v18, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19 -; GISEL-NEXT: v_mul_hi_u32 v14, v8, v13 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v10, v15 +; GISEL-NEXT: v_mul_lo_u32 v19, v9, v15 ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v16, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 @@ -1223,364 +1221,355 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v7, v15 +; GISEL-NEXT: v_mul_hi_u32 v17, v6, v15 +; GISEL-NEXT: v_mul_hi_u32 v15, v9, v15 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v19, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; GISEL-NEXT: v_mov_b32_e32 v19, s11 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18 -; GISEL-NEXT: v_mov_b32_e32 v18, s12 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 -; GISEL-NEXT: v_mul_hi_u32 v15, v10, v15 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 ; GISEL-NEXT: 
v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v16 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v13, v0, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, v0, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v3, v6 +; GISEL-NEXT: v_mul_hi_u32 v14, v2, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 +; GISEL-NEXT: v_mul_lo_u32 v15, v0, v8 +; GISEL-NEXT: v_mul_lo_u32 v16, v1, v8 +; GISEL-NEXT: v_mul_hi_u32 v17, v0, v8 ; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v10, v14, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v3, v7 -; GISEL-NEXT: v_mul_hi_u32 v14, v2, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v3, v7 -; GISEL-NEXT: v_mul_lo_u32 v15, v0, v9 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 +; GISEL-NEXT: v_mul_lo_u32 v18, v2, v9 +; GISEL-NEXT: v_mul_lo_u32 v19, v3, v9 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_mul_lo_u32 v11, v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v13, v3, v10 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; GISEL-NEXT: v_mul_hi_u32 v11, v2, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v3, v10 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v16, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v7, s[6:7], v13, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 
v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v16, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v19, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v19 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v18, v11 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_lo_u32 v14, v8, s8 -; GISEL-NEXT: v_mul_hi_u32 v8, s8, v8 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_mul_lo_u32 v14, v7, s6 +; GISEL-NEXT: v_mul_hi_u32 v7, s6, v7 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_mul_lo_u32 v12, v7, s8 -; GISEL-NEXT: v_mul_hi_u32 v7, s8, v7 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, 
v13 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_mul_lo_u32 v9, v9, s8 -; GISEL-NEXT: v_mul_lo_u32 v10, v10, s8 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; GISEL-NEXT: v_mul_lo_u32 v12, v6, s6 +; GISEL-NEXT: v_mul_hi_u32 v6, s6, v6 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v8, v8, s6 +; GISEL-NEXT: v_mul_lo_u32 v9, v9, s6 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v14 -; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v1, v8, vcc -; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v8 +; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v1, v7, vcc +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v7 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v12 -; GISEL-NEXT: v_subb_u32_e64 v10, s[6:7], v3, v7, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v7 +; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], v3, v6, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v6 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v2, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v7, v5, v7, s[6:7] ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v19, v7, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v6, v5, v6, vcc ; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v0, v4 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc 
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v2, v4 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v2, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v12, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v13, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v8, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v11, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v12, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v13, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v8, v11, vcc -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v14, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v4, v13, v4, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6 ; GISEL-NEXT: 
v_cndmask_b32_e64 v2, v2, v4, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_urem_v2i64_oddk_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s8, 0x12d8fb +; CGP-NEXT: s_mov_b32 s6, 0x12d8fb ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb -; CGP-NEXT: v_cvt_f32_u32_e32 v5, 0x12d8fb -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 -; CGP-NEXT: s_mov_b32 s6, 0xffed2705 -; CGP-NEXT: s_bfe_i32 s4, -1, 0x10000 -; CGP-NEXT: s_bfe_i32 s5, -1, 0x10000 +; CGP-NEXT: v_cvt_f32_u32_e32 v6, 0x12d8fb +; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 +; CGP-NEXT: s_mov_b32 s7, 0xffed2705 +; CGP-NEXT: s_bfe_i32 s4, 1, 0x10000 ; CGP-NEXT: v_cvt_f32_u32_e32 v7, 0x12d8fb ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v8, 0 -; CGP-NEXT: s_bfe_i32 s7, -1, 0x10000 -; CGP-NEXT: s_bfe_i32 s9, -1, 0x10000 -; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 -; CGP-NEXT: v_mov_b32_e32 v6, s4 -; CGP-NEXT: v_mov_b32_e32 v9, s5 +; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v5 +; CGP-NEXT: v_mov_b32_e32 v5, s4 ; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v8 -; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 ; CGP-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 -; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v5 -; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v7 +; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 +; CGP-NEXT: v_mul_f32_e32 v9, 0x2f800000, v7 ; CGP-NEXT: v_trunc_f32_e32 v8, v8 -; CGP-NEXT: v_trunc_f32_e32 v10, v10 -; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v8 +; CGP-NEXT: v_trunc_f32_e32 v9, v9 +; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v8 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 -; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v10 -; CGP-NEXT: 
v_cvt_u32_f32_e32 v10, v10 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CGP-NEXT: v_mul_lo_u32 v11, v8, s6 +; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v9 +; CGP-NEXT: v_cvt_u32_f32_e32 v9, v9 +; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 +; CGP-NEXT: v_mul_lo_u32 v10, v8, s7 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_mul_lo_u32 v12, v10, s6 -; CGP-NEXT: v_mul_lo_u32 v13, v5, s6 -; CGP-NEXT: v_mul_hi_u32 v14, s6, v5 -; CGP-NEXT: v_sub_i32_e32 v11, vcc, v11, v5 -; CGP-NEXT: v_mul_lo_u32 v15, v7, s6 -; CGP-NEXT: v_mul_hi_u32 v16, s6, v7 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, v12, v7 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; CGP-NEXT: v_mul_lo_u32 v14, v8, v13 -; CGP-NEXT: v_mul_hi_u32 v17, v5, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v16 -; CGP-NEXT: v_mul_lo_u32 v16, v10, v15 -; CGP-NEXT: v_mul_hi_u32 v18, v7, v15 -; CGP-NEXT: v_mul_hi_u32 v15, v10, v15 -; CGP-NEXT: v_mul_lo_u32 v19, v7, v12 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v19 +; CGP-NEXT: v_mul_lo_u32 v11, v9, s7 +; CGP-NEXT: v_mul_lo_u32 v12, v6, s7 +; CGP-NEXT: v_mul_hi_u32 v13, s7, v6 +; CGP-NEXT: v_sub_i32_e32 v10, vcc, v10, v6 +; CGP-NEXT: v_mul_lo_u32 v14, v7, s7 +; CGP-NEXT: v_mul_hi_u32 v15, s7, v7 +; CGP-NEXT: v_sub_i32_e32 v11, vcc, v11, v7 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v8, v12 +; CGP-NEXT: v_mul_hi_u32 v16, v6, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v15 +; CGP-NEXT: v_mul_lo_u32 v15, v9, v14 +; CGP-NEXT: v_mul_hi_u32 v17, v7, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v9, v14 +; CGP-NEXT: v_mul_lo_u32 v18, v6, v10 +; CGP-NEXT: v_mul_lo_u32 v19, v7, v11 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v19 ; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; CGP-NEXT: v_mul_lo_u32 v16, v5, v11 -; CGP-NEXT: v_mul_lo_u32 v18, v8, v11 -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; 
CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17 -; CGP-NEXT: v_mul_hi_u32 v14, v5, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v16, v17 -; CGP-NEXT: v_mul_lo_u32 v17, v10, v12 -; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v18, v13 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17 +; CGP-NEXT: v_mul_lo_u32 v15, v8, v10 +; CGP-NEXT: v_mul_hi_u32 v17, v6, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v10 +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v18 ; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 +; CGP-NEXT: v_mul_lo_u32 v13, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v18, v16 +; CGP-NEXT: v_mul_hi_u32 v18, v7, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v9, v11 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v15, v12 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v18, v14 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v18, vcc, v19, v18 -; CGP-NEXT: v_mul_hi_u32 v19, v7, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v10, v12 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v17 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v19 -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v18 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v18 ; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v18 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; 
CGP-NEXT: v_add_i32_e32 v17, vcc, v19, v17 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v18 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v16 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v11, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v5, s6 -; CGP-NEXT: v_mul_hi_u32 v13, s6, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v15 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v10, v12, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v7, s6 -; CGP-NEXT: v_mul_hi_u32 v14, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v15, v8, s6 -; CGP-NEXT: v_mul_lo_u32 v16, v8, v11 -; CGP-NEXT: v_mul_hi_u32 v17, v5, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 -; CGP-NEXT: v_mul_lo_u32 v18, v10, s6 -; CGP-NEXT: v_mul_lo_u32 v19, v10, v12 -; CGP-NEXT: v_sub_i32_e32 v15, vcc, v15, v5 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mul_hi_u32 v15, v7, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v10, v12 -; CGP-NEXT: v_sub_i32_e32 v18, vcc, v18, v7 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v18, v14 -; CGP-NEXT: v_mul_lo_u32 v18, v7, v14 -; CGP-NEXT: v_add_i32_e32 v18, vcc, v19, v18 -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v18, v15 -; CGP-NEXT: v_mul_lo_u32 v15, v5, v13 -; CGP-NEXT: v_mul_lo_u32 v18, v8, v13 -; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v16, v15 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v17 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v15 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v6, s7 +; CGP-NEXT: v_mul_hi_u32 v12, s7, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v13 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v7, s7 +; CGP-NEXT: v_mul_hi_u32 v13, s7, v7 +; CGP-NEXT: v_mul_lo_u32 v14, v8, s7 +; CGP-NEXT: 
v_mul_lo_u32 v15, v8, v10 +; CGP-NEXT: v_mul_hi_u32 v16, v6, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v10 +; CGP-NEXT: v_mul_lo_u32 v17, v9, s7 +; CGP-NEXT: v_mul_lo_u32 v18, v9, v11 +; CGP-NEXT: v_mul_hi_u32 v19, v7, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v9, v11 +; CGP-NEXT: v_sub_i32_e32 v14, vcc, v14, v6 +; CGP-NEXT: v_sub_i32_e32 v17, vcc, v17, v7 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v17, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v6, v12 +; CGP-NEXT: v_mul_lo_u32 v17, v7, v13 +; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17 +; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v19 +; CGP-NEXT: v_mul_lo_u32 v17, v8, v12 +; CGP-NEXT: v_mul_hi_u32 v19, v6, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v16 +; CGP-NEXT: v_mul_lo_u32 v14, v9, v13 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v17 -; CGP-NEXT: v_mul_hi_u32 v15, v5, v13 +; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v16 +; CGP-NEXT: v_mul_hi_u32 v16, v7, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v9, v13 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v17, v10 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v16, v17 -; CGP-NEXT: v_mul_lo_u32 v17, v10, v14 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v18, v11 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v18, v15 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v18, vcc, v19, v18 -; CGP-NEXT: v_mul_hi_u32 v19, v7, v14 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v19 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 +; CGP-NEXT: 
v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v19 +; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v17, s[4:5], v17, v19 ; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; CGP-NEXT: v_mov_b32_e32 v19, s7 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; CGP-NEXT: v_mov_b32_e32 v16, s9 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v14 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v18 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v18 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v1, v5 -; CGP-NEXT: v_mul_hi_u32 v13, v0, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v10, v14, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v3, v7 -; CGP-NEXT: v_mul_hi_u32 v14, v2, v7 +; CGP-NEXT: v_add_i32_e32 v18, vcc, v18, v19 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v15 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v18 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v12, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v1, v6 +; CGP-NEXT: v_mul_hi_u32 v12, v0, v6 +; CGP-NEXT: v_mul_hi_u32 v6, v1, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v3, v7 +; CGP-NEXT: v_mul_hi_u32 v13, 
v2, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 -; CGP-NEXT: v_mul_lo_u32 v15, v0, v8 -; CGP-NEXT: v_mul_lo_u32 v17, v1, v8 -; CGP-NEXT: v_mul_hi_u32 v18, v0, v8 +; CGP-NEXT: v_mul_lo_u32 v14, v0, v8 +; CGP-NEXT: v_mul_lo_u32 v15, v1, v8 +; CGP-NEXT: v_mul_hi_u32 v16, v0, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v1, v8 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v15 +; CGP-NEXT: v_mul_lo_u32 v17, v2, v9 +; CGP-NEXT: v_mul_lo_u32 v18, v3, v9 +; CGP-NEXT: v_mul_hi_u32 v19, v2, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v3, v9 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v15, v6 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; CGP-NEXT: v_mul_lo_u32 v11, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v13, v3, v10 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CGP-NEXT: v_mul_hi_u32 v11, v2, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v3, v10 -; CGP-NEXT: v_add_i32_e64 v5, s[6:7], v17, v5 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v7, s[6:7], v13, v7 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v17 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v18 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v18, v7 ; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v16 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v19 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, 
v12 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v17, v11 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v18, v13 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v18 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v17 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_mul_lo_u32 v12, v6, s6 +; CGP-NEXT: v_mul_hi_u32 v6, s6, v6 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v15 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_lo_u32 v14, v5, s8 -; CGP-NEXT: v_mul_hi_u32 v5, s8, v5 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_mul_lo_u32 v12, v7, s8 -; CGP-NEXT: v_mul_hi_u32 v7, s8, v7 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v13 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; CGP-NEXT: v_mul_lo_u32 v8, v8, s8 -; CGP-NEXT: v_mul_lo_u32 v10, v10, s8 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v14 -; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v1, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v5 +; CGP-NEXT: v_mul_lo_u32 v13, v7, s6 +; CGP-NEXT: v_mul_hi_u32 v7, s6, v7 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_mul_lo_u32 v8, v8, s6 +; CGP-NEXT: v_mul_lo_u32 v9, v9, s6 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 +; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v1, v6, vcc +; CGP-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v6 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v12 -; CGP-NEXT: v_subb_u32_e64 v10, s[6:7], v3, 
v7, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v13 +; CGP-NEXT: v_subb_u32_e64 v9, s[6:7], v3, v7, s[4:5] ; CGP-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v7 ; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v2, v4 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[6:7] ; CGP-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v8 -; CGP-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[6:7] +; CGP-NEXT: v_cndmask_b32_e64 v6, v5, v6, s[6:7] ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 -; CGP-NEXT: v_cndmask_b32_e32 v6, v19, v7, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 +; CGP-NEXT: v_cndmask_b32_e32 v7, v5, v7, vcc ; CGP-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v7, vcc, v0, v4 +; CGP-NEXT: v_sub_i32_e32 v10, vcc, v0, v4 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc ; CGP-NEXT: v_sub_i32_e32 v12, vcc, v2, v4 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v12, v4 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc -; CGP-NEXT: v_sub_i32_e32 v11, vcc, v7, v4 -; CGP-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v11, v5, v11, vcc +; CGP-NEXT: v_sub_i32_e32 v14, vcc, v10, v4 +; CGP-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc +; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc ; CGP-NEXT: v_sub_i32_e32 v4, vcc, v12, v4 -; CGP-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; CGP-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc -; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v13 +; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v3, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; CGP-NEXT: v_cndmask_b32_e32 v10, 
v10, v14, vcc +; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5 ; CGP-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v14, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[4:5] -; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6 +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc +; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v13, s[4:5] +; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7 ; CGP-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5] ; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CGP-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5] ; CGP-NEXT: s_setpc_b64 s[30:31] %result = urem <2 x i64> %num, ret <2 x i64> %result @@ -2435,17 +2424,17 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v11 ; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v4, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 ; GISEL-NEXT: v_mul_lo_u32 v13, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v14, v4, v12 -; GISEL-NEXT: v_mul_lo_u32 v15, v5, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v4, v12 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v15, v5, v6 +; GISEL-NEXT: v_mul_hi_u32 v16, v4, v6 ; GISEL-NEXT: v_mul_lo_u32 v17, v9, v7 ; GISEL-NEXT: v_mul_lo_u32 v18, v10, v7 ; GISEL-NEXT: v_mul_hi_u32 v19, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 ; GISEL-NEXT: v_mul_lo_u32 v15, v11, v17 ; GISEL-NEXT: v_mul_hi_u32 v18, v7, v17 @@ -2455,212 +2444,206 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; 
GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v18 ; GISEL-NEXT: v_mul_lo_u32 v15, v8, v14 -; GISEL-NEXT: v_mul_hi_u32 v18, v12, v14 +; GISEL-NEXT: v_mul_hi_u32 v18, v6, v14 ; GISEL-NEXT: v_mul_hi_u32 v14, v8, v14 ; GISEL-NEXT: v_mul_hi_u32 v17, v11, v17 -; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v6, v16 -; GISEL-NEXT: v_mul_lo_u32 v6, v12, v16 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v15, v6 +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v6, v12 +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18 +; GISEL-NEXT: v_mul_lo_u32 v15, v8, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v16, v18 +; GISEL-NEXT: v_mul_hi_u32 v18, v6, v12 +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v18 -; GISEL-NEXT: v_mul_lo_u32 v6, v8, v16 +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v12, v16 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v6, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v13 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v17 -; GISEL-NEXT: v_mul_hi_u32 v17, v7, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18 +; GISEL-NEXT: v_mul_lo_u32 v19, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 +; GISEL-NEXT: v_mul_hi_u32 v19, v7, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; 
GISEL-NEXT: v_add_i32_e32 v17, vcc, v6, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v20, vcc, v20, v6 -; GISEL-NEXT: v_and_b32_e32 v6, 0xffffff, v0 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v2 -; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s5, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s6, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s7, -1, 0x10000 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v18, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; GISEL-NEXT: v_mov_b32_e32 v15, s4 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v20, v18 -; GISEL-NEXT: v_mov_b32_e32 v19, s5 -; GISEL-NEXT: v_mul_hi_u32 v16, v8, v16 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; GISEL-NEXT: v_mov_b32_e32 v16, s6 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v20, v19 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GISEL-NEXT: s_bfe_i32 s4, 1, 0x10000 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_mov_b32_e32 v16, s4 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 ; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v14, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v2 -; GISEL-NEXT: v_mul_lo_u32 v5, v5, v2 -; GISEL-NEXT: v_mul_hi_u32 v14, v4, v2 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v12, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v5, 
v5, v6 +; GISEL-NEXT: v_mul_hi_u32 v14, v4, v6 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 ; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v13, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7 ; GISEL-NEXT: v_mul_lo_u32 v10, v10, v7 -; GISEL-NEXT: v_mul_hi_u32 v17, v9, v7 +; GISEL-NEXT: v_mul_hi_u32 v15, v9, v7 ; GISEL-NEXT: v_mul_lo_u32 v4, v4, v8 -; GISEL-NEXT: v_mul_lo_u32 v18, v8, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_mul_hi_u32 v5, v2, v12 +; GISEL-NEXT: v_mul_lo_u32 v17, v8, v12 +; GISEL-NEXT: v_mul_hi_u32 v18, v6, v12 ; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 ; GISEL-NEXT: v_mul_lo_u32 v9, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v19, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GISEL-NEXT: v_mul_hi_u32 v5, v7, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_mul_lo_u32 v10, v11, v13 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v17 -; GISEL-NEXT: v_mul_lo_u32 v17, v7, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_mul_lo_u32 v10, v2, v4 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 +; GISEL-NEXT: v_mul_lo_u32 v10, v6, v4 ; GISEL-NEXT: v_mul_lo_u32 v14, v8, v4 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v18, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v2, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v18, v10 -; GISEL-NEXT: v_mul_lo_u32 v18, v11, v9 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v12, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 -; 
GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 -; GISEL-NEXT: v_mul_hi_u32 v17, v7, v9 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v18, v17 -; GISEL-NEXT: v_mov_b32_e32 v18, s7 +; GISEL-NEXT: v_mul_hi_u32 v15, v6, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v17, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18 +; GISEL-NEXT: v_mul_lo_u32 v10, v7, v9 +; GISEL-NEXT: v_mul_lo_u32 v18, v11, v9 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v19, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v7, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v14, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v18, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v19, v18 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, 
v4, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 ; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, 0, v2 -; GISEL-NEXT: v_mul_hi_u32 v8, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v2, 0, v2 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v11, v9, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v11, v0, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 -; GISEL-NEXT: v_mul_lo_u32 v12, v6, v4 +; GISEL-NEXT: v_mul_lo_u32 v8, 0, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, v0, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, 0, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, 0, v5 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 +; GISEL-NEXT: v_mul_lo_u32 v12, v0, v4 ; GISEL-NEXT: v_mul_lo_u32 v13, 0, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, v6, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v0, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 -; GISEL-NEXT: v_mul_lo_u32 v17, v0, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17 +; GISEL-NEXT: v_mul_lo_u32 v15, v2, v7 +; GISEL-NEXT: v_mul_lo_u32 v17, 0, v7 +; GISEL-NEXT: v_mul_hi_u32 v18, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v17, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; 
GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_mul_lo_u32 v10, 0, v9 -; GISEL-NEXT: v_mul_hi_u32 v11, v0, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, 0, v9 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v13, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v2, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, v3, v2 -; GISEL-NEXT: v_mul_lo_u32 v12, 0, v2 -; GISEL-NEXT: v_mul_hi_u32 v2, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 -; GISEL-NEXT: v_mul_lo_u32 v13, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v10 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 +; GISEL-NEXT: v_add_i32_e32 v10, 
vcc, v13, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v15, v9 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v17, v11 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v10, v3, v6 +; GISEL-NEXT: v_mul_lo_u32 v12, 0, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v11, v1, v5 +; GISEL-NEXT: v_mul_lo_u32 v13, 0, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GISEL-NEXT: v_mul_lo_u32 v4, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v5, v1, v5 +; GISEL-NEXT: v_mul_lo_u32 v7, v1, v7 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v7 -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v6, v8 -; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], 0, v2, vcc -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], 0, v2 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], 0, v4, vcc +; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], 0, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v11 -; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], 0, v4, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], 0, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[6:7] +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v11 +; GISEL-NEXT: v_subb_u32_e64 v8, s[6:7], 0, v5, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v5, s[6:7], 0, v5 +; GISEL-NEXT: 
v_cmp_ge_u32_e64 s[6:7], v2, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v7, v15, v7, s[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v0, vcc, 0, v0, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v5, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, v16, v7, s[6:7] +; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v5, vcc, 0, v5, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v0, v3 +; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v8, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v0, vcc +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v2, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v12, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v19, v11, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc ; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v10, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v2, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v18, v0, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v4, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v5, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; 
GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v2, v14, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v13, v15, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v8, v1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v10, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v15, s[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_urem_v2i64_24bit: -- 2.7.4