From: Matt Arsenault
Date: Wed, 6 Jan 2021 23:45:12 +0000 (-0500)
Subject: GlobalISel: Add combine for G_UREM by power of 2
X-Git-Tag: llvmorg-13-init~1804
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=1f9b6ef91ffd8ea487aa083d146c7568e7243457;p=platform%2Fupstream%2Fllvm.git

GlobalISel: Add combine for G_UREM by power of 2

Really I want this in the legalizer, but this is a start.
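The fold is the usual mask trick: for a power of two P and unsigned x,
x urem P == x & (P - 1). A standalone sketch of that identity (illustrative
only, not part of this patch; the combine itself is the CombinerHelper
change below):

  // Spot-check x % P == x & (P - 1) for 32-bit powers of two.
  #include <cassert>
  #include <cstdint>
  #include <initializer_list>

  int main() {
    for (uint32_t Shift = 0; Shift < 32; ++Shift) {
      uint32_t P = 1u << Shift;
      for (uint32_t X : {0u, 1u, 4095u, 4096u, 4097u, 0xffffffffu})
        assert(X % P == (X & (P - 1)));
    }
  }

The mask is emitted as P + (-1) rather than folded to a constant because P
is only known to be *a* power of 2 (e.g. 1 << n), not which one.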
---

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 432587e..0d240e9 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -400,6 +400,9 @@ public:
   /// Check if operand \p OpIdx is undef.
   bool matchOperandIsUndef(MachineInstr &MI, unsigned OpIdx);
 
+  /// Check if operand \p OpIdx is known to be a power of 2.
+  bool matchOperandIsKnownToBeAPowerOfTwo(MachineInstr &MI, unsigned OpIdx);
+
   /// Erase \p MI
   bool eraseInst(MachineInstr &MI);
 
@@ -459,6 +462,9 @@ public:
   bool matchPtrAddZero(MachineInstr &MI);
   bool applyPtrAddZero(MachineInstr &MI);
 
+  /// Combine G_UREM x, (known power of 2) to an add and bitmasking.
+  bool applySimplifyURemByPow2(MachineInstr &MI);
+
   bool matchCombineInsertVecElts(MachineInstr &MI,
                                  SmallVectorImpl<Register> &MatchInfo);
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 32aec75..e352e49 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -296,6 +296,13 @@ def binop_left_to_zero: GICombineRule<
   (apply [{ return Helper.replaceSingleDefInstWithOperand(*${root}, 1); }])
 >;
 
+def urem_pow2_to_mask : GICombineRule<
+  (defs root:$root),
+  (match (wip_match_opcode G_UREM):$root,
+    [{ return Helper.matchOperandIsKnownToBeAPowerOfTwo(*${root}, 2); }]),
+  (apply [{ return Helper.applySimplifyURemByPow2(*${root}); }])
+>;
+
 // Fold (x op 0) -> 0
 def binop_right_to_zero: GICombineRule<
   (defs root:$root),
@@ -560,7 +567,7 @@ def identity_combines : GICombineGroup<[select_same_val, right_identity_zero,
 def const_combines : GICombineGroup<[constant_fp_op, const_ptradd_to_i2p]>;
 
 def known_bits_simplifications : GICombineGroup<[
-  redundant_and, redundant_sext_inreg, redundant_or]>;
+  redundant_and, redundant_sext_inreg, redundant_or, urem_pow2_to_mask]>;
 
 def width_reduction_combines : GICombineGroup<[reduce_shl_of_extend]>;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index abc23da..bbcf32a 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -2580,6 +2580,12 @@ bool CombinerHelper::matchOperandIsUndef(MachineInstr &MI, unsigned OpIdx) {
          getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, MO.getReg(), MRI);
 }
 
+bool CombinerHelper::matchOperandIsKnownToBeAPowerOfTwo(MachineInstr &MI,
+                                                        unsigned OpIdx) {
+  MachineOperand &MO = MI.getOperand(OpIdx);
+  return isKnownToBeAPowerOfTwo(MO.getReg(), MRI, KB);
+}
+
 bool CombinerHelper::replaceInstWithFConstant(MachineInstr &MI, double C) {
   assert(MI.getNumDefs() == 1 && "Expected only one def?");
   Builder.setInstr(MI);
@@ -3130,6 +3136,22 @@ bool CombinerHelper::applyPtrAddZero(MachineInstr &MI) {
   return true;
 }
 
+/// The second source operand is known to be a power of 2.
+bool CombinerHelper::applySimplifyURemByPow2(MachineInstr &MI) {
+  Register DstReg = MI.getOperand(0).getReg();
+  Register Src0 = MI.getOperand(1).getReg();
+  Register Pow2Src1 = MI.getOperand(2).getReg();
+  LLT Ty = MRI.getType(DstReg);
+  Builder.setInstrAndDebugLoc(MI);
+
+  // Fold (urem x, pow2) -> (and x, pow2-1)
+  auto NegOne = Builder.buildConstant(Ty, -1);
+  auto Add = Builder.buildAdd(Ty, Pow2Src1, NegOne);
+  Builder.buildAnd(DstReg, Src0, Add);
+  MI.eraseFromParent();
+  return true;
+}
+
 bool CombinerHelper::tryCombine(MachineInstr &MI) {
   if (tryCombineCopy(MI))
     return true;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-urem-pow-2.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-urem-pow-2.mir
new file mode 100644
index 0000000..f92e32d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-urem-pow-2.mir
@@ -0,0 +1,156 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: urem_s32_var_const0
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: urem_s32_var_const0
+    ; GCN: liveins: $vgpr0
+    ; GCN: %var:_(s32) = COPY $vgpr0
+    ; GCN: %const:_(s32) = G_CONSTANT i32 0
+    ; GCN: %rem:_(s32) = G_UREM %var, %const
+    ; GCN: $vgpr0 = COPY %rem(s32)
+    %var:_(s32) = COPY $vgpr0
+    %const:_(s32) = G_CONSTANT i32 0
+    %rem:_(s32) = G_UREM %var, %const
+    $vgpr0 = COPY %rem
+...
+
+---
+name: urem_s32_var_const1
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: urem_s32_var_const1
+    ; GCN: liveins: $vgpr0
+    ; GCN: %const:_(s32) = G_CONSTANT i32 1
+    ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; GCN: [[ADD:%[0-9]+]]:_(s32) = G_ADD %const, [[C]]
+    ; GCN: $vgpr0 = COPY [[ADD]](s32)
+    %var:_(s32) = COPY $vgpr0
+    %const:_(s32) = G_CONSTANT i32 1
+    %rem:_(s32) = G_UREM %var, %const
+    $vgpr0 = COPY %rem
+...
+
+---
+name: urem_s32_var_const2
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: urem_s32_var_const2
+    ; GCN: liveins: $vgpr0
+    ; GCN: %const:_(s32) = G_CONSTANT i32 1
+    ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; GCN: [[ADD:%[0-9]+]]:_(s32) = G_ADD %const, [[C]]
+    ; GCN: $vgpr0 = COPY [[ADD]](s32)
+    %var:_(s32) = COPY $vgpr0
+    %const:_(s32) = G_CONSTANT i32 1
+    %rem:_(s32) = G_UREM %var, %const
+    $vgpr0 = COPY %rem
+...
+
+---
+name: urem_s32_var_shl1
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; GCN-LABEL: name: urem_s32_var_shl1
+    ; GCN: liveins: $vgpr0, $vgpr1
+    ; GCN: %var:_(s32) = COPY $vgpr0
+    ; GCN: %shift_amt:_(s32) = COPY $vgpr1
+    ; GCN: %one:_(s32) = G_CONSTANT i32 1
+    ; GCN: %one_bit:_(s32) = G_SHL %one, %shift_amt(s32)
+    ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; GCN: [[ADD:%[0-9]+]]:_(s32) = G_ADD %one_bit, [[C]]
+    ; GCN: %rem:_(s32) = G_AND %var, [[ADD]]
+    ; GCN: $vgpr0 = COPY %rem(s32)
+    %var:_(s32) = COPY $vgpr0
+    %shift_amt:_(s32) = COPY $vgpr1
+    %one:_(s32) = G_CONSTANT i32 1
+    %one_bit:_(s32) = G_SHL %one, %shift_amt
+    %rem:_(s32) = G_UREM %var, %one_bit
+    $vgpr0 = COPY %rem
+...
+
+---
+name: urem_s64_var_shl1
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2
+
+    ; GCN-LABEL: name: urem_s64_var_shl1
+    ; GCN: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GCN: %var:_(s64) = COPY $vgpr0_vgpr1
+    ; GCN: %shiftamt:_(s32) = COPY $vgpr2
+    ; GCN: %one:_(s64) = G_CONSTANT i64 1
+    ; GCN: %one_bit:_(s64) = G_SHL %one, %shiftamt(s32)
+    ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; GCN: [[ADD:%[0-9]+]]:_(s64) = G_ADD %one_bit, [[C]]
+    ; GCN: %rem:_(s64) = G_AND %var, [[ADD]]
+    ; GCN: $vgpr0_vgpr1 = COPY %rem(s64)
+    %var:_(s64) = COPY $vgpr0_vgpr1
+    %shiftamt:_(s32) = COPY $vgpr2
+    %one:_(s64) = G_CONSTANT i64 1
+    %one_bit:_(s64) = G_SHL %one, %shiftamt
+    %rem:_(s64) = G_UREM %var, %one_bit
+    $vgpr0_vgpr1 = COPY %rem
+...
+
+---
+name: urem_v2s32_var_shl1
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; GCN-LABEL: name: urem_v2s32_var_shl1
+    ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GCN: %var:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    ; GCN: %shift_amt:_(<2 x s32>) = COPY $vgpr2_vgpr3
+    ; GCN: %one:_(s32) = G_CONSTANT i32 1
+    ; GCN: %one_vec:_(<2 x s32>) = G_BUILD_VECTOR %one(s32), %one(s32)
+    ; GCN: %one_bit:_(<2 x s32>) = G_SHL %one_vec, %shift_amt(<2 x s32>)
+    ; GCN: %rem:_(<2 x s32>) = G_UREM %var, %one_bit
+    ; GCN: $vgpr0_vgpr1 = COPY %rem(<2 x s32>)
+    %var:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    %shift_amt:_(<2 x s32>) = COPY $vgpr2_vgpr3
+    %one:_(s32) = G_CONSTANT i32 1
+    %one_vec:_(<2 x s32>) = G_BUILD_VECTOR %one, %one
+    %one_bit:_(<2 x s32>) = G_SHL %one_vec, %shift_amt
+    %rem:_(<2 x s32>) = G_UREM %var, %one_bit
+    $vgpr0_vgpr1 = COPY %rem
+...
+
+---
+name: urem_v2s16_var_const4_build_vector_trunc
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; GCN-LABEL: name: urem_v2s16_var_const4_build_vector_trunc
+    ; GCN: liveins: $vgpr0, $vgpr1
+    ; GCN: %var:_(<2 x s16>) = COPY $vgpr0
+    ; GCN: %four:_(s32) = G_CONSTANT i32 4
+    ; GCN: %four_vec:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %four(s32), %four(s32)
+    ; GCN: %rem:_(<2 x s16>) = G_UREM %var, %four_vec
+    ; GCN: $vgpr0 = COPY %rem(<2 x s16>)
+    %var:_(<2 x s16>) = COPY $vgpr0
+    %shift_amt:_(<2 x s16>) = COPY $vgpr1
+    %four:_(s32) = G_CONSTANT i32 4
+    %four_vec:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %four, %four
+    %rem:_(<2 x s16>) = G_UREM %var, %four_vec
+    $vgpr0 = COPY %rem
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
index 7850d42..e6bee5e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
@@ -207,24 +207,8 @@ define i32 @v_urem_i32_pow2k_denom(i32 %num) {
 ; CHECK-LABEL: v_urem_i32_pow2k_denom:
 ; CHECK:         ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_movk_i32 s4, 0x1000
-; CHECK-NEXT:    v_mov_b32_e32 v1, 0xfffff000
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, s4
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; CHECK-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; CHECK-NEXT:    v_mul_lo_u32 v1, v1, v2
-; CHECK-NEXT:    v_mul_hi_u32 v1, v2, v1
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; CHECK-NEXT:    v_mul_hi_u32 v1, v0, v1
-; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 12, v1
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
-; CHECK-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
-; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; CHECK-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
-; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT:    s_add_i32 s4, 0x1000, -1
+; CHECK-NEXT:    v_and_b32_e32 v0, s4, v0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = urem i32 %num, 4096
   ret i32 %result
@@ -266,42 +250,9 @@ define <2 x i32> @v_urem_v2i32_pow2k_denom(<2 x i32> %num) {
 ; CGP-LABEL: v_urem_v2i32_pow2k_denom:
 ; CGP:         ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_movk_i32 s4, 0x1000
-; CGP-NEXT:    v_mov_b32_e32 v2, 0x1000
-; CGP-NEXT:    s_mov_b32 s5, 0x4f7ffffe
-; CGP-NEXT:    s_movk_i32 s6, 0xf000
-; CGP-NEXT:    v_cvt_f32_u32_e32 v3, s4
-; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v2
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; CGP-NEXT:    v_mul_f32_e32 v3, s5, v3
-; CGP-NEXT:    v_mul_f32_e32 v4, s5, v4
-; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT:    v_mul_lo_u32 v5, s6, v3
-; CGP-NEXT:    v_mul_lo_u32 v6, s6, v4
-; CGP-NEXT:    v_mul_hi_u32 v5, v3, v5
-; CGP-NEXT:    v_mul_hi_u32 v6, v4, v6
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; CGP-NEXT:    v_mul_hi_u32 v3, v0, v3
-; CGP-NEXT:    v_mul_hi_u32 v4, v1, v4
-; CGP-NEXT:    v_lshlrev_b32_e32 v3, 12, v3
-; CGP-NEXT:    v_lshlrev_b32_e32 v4, 12, v4
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
-; CGP-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v0
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v1, v2
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; CGP-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v0
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v1, v2
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; CGP-NEXT:    s_add_i32 s4, 0x1000, -1
+; CGP-NEXT:    v_and_b32_e32 v0, s4, v0
+; CGP-NEXT:    v_and_b32_e32 v1, s4, v1
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = urem <2 x i32> %num, <i32 4096, i32 4096>
   ret <2 x i32> %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index 2e1292d..60084a0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -949,131 +949,13 @@ define i64 @v_urem_i64_pow2k_denom(i64 %num) {
 ; CHECK-LABEL: v_urem_i64_pow2k_denom:
 ; CHECK:         ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, 0x1000
-; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v3, 0
-; CHECK-NEXT:    s_movk_i32 s6, 0xf000
-; CHECK-NEXT:    s_movk_i32 s7, 0x1000
-; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
-; CHECK-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
-; CHECK-NEXT:    v_trunc_f32_e32 v3, v3
-; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; CHECK-NEXT:    v_mul_lo_u32 v4, s6, v3
-; CHECK-NEXT:    v_mul_lo_u32 v5, s6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v6, -1, v2
-; CHECK-NEXT:    v_mul_hi_u32 v7, s6, v2
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CHECK-NEXT:    v_mul_lo_u32 v6, v3, v5
-; CHECK-NEXT:    v_mul_hi_u32 v8, v2, v5
-; CHECK-NEXT:    v_mul_hi_u32 v5, v3, v5
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; CHECK-NEXT:    v_mul_lo_u32 v7, v2, v4
-; CHECK-NEXT:    v_mul_lo_u32 v9, v3, v4
-; CHECK-NEXT:    v_mul_hi_u32 v10, v2, v4
-; CHECK-NEXT:    v_mul_hi_u32 v4, v3, v4
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v8
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; CHECK-NEXT:    v_addc_u32_e64 v5, s[4:5], v3, v4, vcc
-; CHECK-NEXT:    v_add_i32_e64 v3, s[4:5], v3, v4
-; CHECK-NEXT:    v_mul_lo_u32 v4, s6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v6, -1, v2
-; CHECK-NEXT:    v_mul_hi_u32 v7, s6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v8, s6, v5
-; CHECK-NEXT:    v_mul_lo_u32 v9, v5, v4
-; CHECK-NEXT:    v_mul_hi_u32 v10, v2, v4
-; CHECK-NEXT:    v_mul_hi_u32 v4, v5, v4
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v8
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
-; CHECK-NEXT:    v_mul_lo_u32 v7, v2, v6
-; CHECK-NEXT:    v_mul_lo_u32 v8, v5, v6
-; CHECK-NEXT:    v_mul_hi_u32 v11, v2, v6
-; CHECK-NEXT:    v_mul_hi_u32 v5, v5, v6
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v9, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v4, s[4:5], v8, v4
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v10
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v11
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v7, v6
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v9
-; CHECK-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v7, v6
-; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v6
-; CHECK-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; CHECK-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v4, v1, v2
-; CHECK-NEXT:    v_mul_hi_u32 v5, v0, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
-; CHECK-NEXT:    v_mul_lo_u32 v6, v0, v3
-; CHECK-NEXT:    v_mul_lo_u32 v7, v1, v3
-; CHECK-NEXT:    v_mul_hi_u32 v8, v0, v3
-; CHECK-NEXT:    v_mul_hi_u32 v3, v1, v3
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT:    v_mul_lo_u32 v5, s7, v2
-; CHECK-NEXT:    v_mul_lo_u32 v6, 0, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, s7, v2
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; CHECK-NEXT:    v_mul_lo_u32 v3, s7, v3
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
-; CHECK-NEXT:    v_subb_u32_e64 v3, s[4:5], v1, v2, vcc
-; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v2
-; CHECK-NEXT:    v_cmp_le_u32_e64 s[4:5], s7, v0
-; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_le_u32_e64 s[4:5], 0, v3
-; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; CHECK-NEXT:    v_subrev_i32_e32 v4, vcc, s7, v0
-; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s7, v4
-; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, 0, v1
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; CHECK-NEXT:    v_subrev_i32_e32 v7, vcc, s7, v4
-; CHECK-NEXT:    v_subbrev_u32_e32 v8, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; CHECK-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; CHECK-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; CHECK-NEXT:    s_add_u32 s4, 0x1000, -1
+; CHECK-NEXT:    s_cselect_b32 s5, 1, 0
+; CHECK-NEXT:    s_and_b32 s5, s5, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s5, 0
+; CHECK-NEXT:    s_addc_u32 s5, 0, -1
+; CHECK-NEXT:    v_and_b32_e32 v0, s4, v0
+; CHECK-NEXT:    v_and_b32_e32 v1, s5, v1
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = urem i64 %num, 4096
   ret i64 %result
@@ -1344,253 +1226,21 @@ define <2 x i64> @v_urem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-LABEL: v_urem_v2i64_pow2k_denom:
 ; CGP:         ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    v_cvt_f32_u32_e32 v4, 0x1000
-; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
-; CGP-NEXT:    s_movk_i32 s8, 0xf000
-; CGP-NEXT:    s_movk_i32 s10, 0x1000
-; CGP-NEXT:    v_mov_b32_e32 v6, v4
-; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; CGP-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v5
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v5, v6
-; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; CGP-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
-; CGP-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v4
-; CGP-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v5
-; CGP-NEXT:    v_trunc_f32_e32 v6, v6
-; CGP-NEXT:    v_trunc_f32_e32 v7, v7
-; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v6
-; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v7
-; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT:    v_mul_lo_u32 v8, s8, v6
-; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; CGP-NEXT:    v_mul_lo_u32 v9, s8, v7
-; CGP-NEXT:    v_mul_lo_u32 v10, s8, v4
-; CGP-NEXT:    v_mul_lo_u32 v11, -1, v4
-; CGP-NEXT:    v_mul_hi_u32 v12, s8, v4
-; CGP-NEXT:    v_mul_lo_u32 v13, s8, v5
-; CGP-NEXT:    v_mul_lo_u32 v14, -1, v5
-; CGP-NEXT:    v_mul_hi_u32 v15, s8, v5
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v11, v8
-; CGP-NEXT:    v_mul_lo_u32 v11, v6, v10
-; CGP-NEXT:    v_mul_hi_u32 v16, v4, v10
-; CGP-NEXT:    v_mul_hi_u32 v10, v6, v10
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v14, v9
-; CGP-NEXT:    v_mul_lo_u32 v14, v7, v13
-; CGP-NEXT:    v_mul_hi_u32 v17, v5, v13
-; CGP-NEXT:    v_mul_hi_u32 v13, v7, v13
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v15
-; CGP-NEXT:    v_mul_lo_u32 v12, v4, v8
-; CGP-NEXT:    v_mul_lo_u32 v15, v6, v8
-; CGP-NEXT:    v_mul_hi_u32 v18, v4, v8
-; CGP-NEXT:    v_mul_hi_u32 v8, v6, v8
-; CGP-NEXT:    v_mul_lo_u32 v19, v5, v9
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v19
-; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
-; CGP-NEXT:    v_mul_lo_u32 v14, v7, v9
-; CGP-NEXT:    v_mul_hi_u32 v17, v5, v9
-; CGP-NEXT:    v_mul_hi_u32 v9, v7, v9
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v15, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v18
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v15, v16
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v19, v18
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v14, v15
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; CGP-NEXT:    v_addc_u32_e64 v10, s[4:5], v6, v8, vcc
-; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v8
-; CGP-NEXT:    v_mul_lo_u32 v8, s8, v4
-; CGP-NEXT:    v_mul_lo_u32 v11, -1, v4
-; CGP-NEXT:    v_mul_hi_u32 v12, s8, v4
-; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v13
-; CGP-NEXT:    v_addc_u32_e64 v13, s[6:7], v7, v9, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v7, s[6:7], v7, v9
-; CGP-NEXT:    v_mul_lo_u32 v9, s8, v5
-; CGP-NEXT:    v_mul_lo_u32 v14, -1, v5
-; CGP-NEXT:    v_mul_hi_u32 v15, s8, v5
-; CGP-NEXT:    v_mul_lo_u32 v16, s8, v10
-; CGP-NEXT:    v_mul_lo_u32 v17, v10, v8
-; CGP-NEXT:    v_mul_hi_u32 v18, v4, v8
-; CGP-NEXT:    v_mul_hi_u32 v8, v10, v8
-; CGP-NEXT:    v_mul_lo_u32 v19, s8, v13
-; CGP-NEXT:    v_add_i32_e64 v11, s[6:7], v11, v16
-; CGP-NEXT:    v_mul_lo_u32 v16, v13, v9
-; CGP-NEXT:    v_add_i32_e64 v14, s[6:7], v14, v19
-; CGP-NEXT:    v_mul_hi_u32 v19, v5, v9
-; CGP-NEXT:    v_mul_hi_u32 v9, v13, v9
-; CGP-NEXT:    v_add_i32_e64 v11, s[6:7], v11, v12
-; CGP-NEXT:    v_add_i32_e64 v12, s[6:7], v14, v15
-; CGP-NEXT:    v_mul_lo_u32 v14, v4, v11
-; CGP-NEXT:    v_mul_lo_u32 v15, v5, v12
-; CGP-NEXT:    v_add_i32_e64 v15, s[6:7], v16, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[6:7]
-; CGP-NEXT:    v_add_i32_e64 v15, s[6:7], v15, v19
-; CGP-NEXT:    v_mul_lo_u32 v15, v10, v11
-; CGP-NEXT:    v_mul_hi_u32 v19, v4, v11
-; CGP-NEXT:    v_mul_hi_u32 v10, v10, v11
-; CGP-NEXT:    v_mul_lo_u32 v11, v13, v12
-; CGP-NEXT:    v_mul_hi_u32 v13, v13, v12
-; CGP-NEXT:    v_mul_hi_u32 v12, v5, v12
-; CGP-NEXT:    v_add_i32_e64 v14, s[8:9], v17, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[8:9]
-; CGP-NEXT:    v_add_i32_e64 v8, s[8:9], v15, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[8:9]
-; CGP-NEXT:    v_add_i32_e64 v9, s[8:9], v11, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[8:9]
-; CGP-NEXT:    v_add_i32_e64 v14, s[8:9], v14, v18
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[8:9]
-; CGP-NEXT:    v_add_i32_e64 v8, s[8:9], v8, v19
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[8:9]
-; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[6:7]
-; CGP-NEXT:    v_add_i32_e64 v9, s[6:7], v9, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[6:7]
-; CGP-NEXT:    v_add_i32_e64 v14, s[6:7], v17, v14
-; CGP-NEXT:    v_add_i32_e64 v15, s[6:7], v15, v18
-; CGP-NEXT:    v_add_i32_e64 v16, s[6:7], v16, v19
-; CGP-NEXT:    v_add_i32_e64 v11, s[6:7], v11, v12
-; CGP-NEXT:    v_add_i32_e64 v8, s[6:7], v8, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[6:7]
-; CGP-NEXT:    v_add_i32_e64 v9, s[6:7], v9, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[6:7]
-; CGP-NEXT:    v_add_i32_e64 v12, s[6:7], v15, v12
-; CGP-NEXT:    v_add_i32_e64 v11, s[6:7], v11, v14
-; CGP-NEXT:    v_add_i32_e64 v10, s[6:7], v10, v12
-; CGP-NEXT:    v_add_i32_e64 v11, s[6:7], v13, v11
-; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v6, v10, vcc
-; CGP-NEXT:    v_addc_u32_e64 v7, vcc, v7, v11, s[4:5]
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; CGP-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
-; CGP-NEXT:    v_mul_lo_u32 v8, v3, v4
-; CGP-NEXT:    v_mul_hi_u32 v10, v2, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; CGP-NEXT:    v_mul_lo_u32 v9, v1, v5
-; CGP-NEXT:    v_mul_hi_u32 v11, v0, v5
-; CGP-NEXT:    v_mul_hi_u32 v5, v1, v5
-; CGP-NEXT:    v_mul_lo_u32 v12, v2, v6
-; CGP-NEXT:    v_mul_lo_u32 v13, v3, v6
-; CGP-NEXT:    v_mul_hi_u32 v14, v2, v6
-; CGP-NEXT:    v_mul_hi_u32 v6, v3, v6
-; CGP-NEXT:    v_mul_lo_u32 v15, v0, v7
-; CGP-NEXT:    v_mul_lo_u32 v16, v1, v7
-; CGP-NEXT:    v_mul_hi_u32 v17, v0, v7
-; CGP-NEXT:    v_mul_hi_u32 v7, v1, v7
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v16, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v15, v9
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v16, v11
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; CGP-NEXT:    v_mul_lo_u32 v10, s10, v4
-; CGP-NEXT:    v_mul_lo_u32 v12, 0, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, s10, v4
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; CGP-NEXT:    v_mul_lo_u32 v11, s10, v5
-; CGP-NEXT:    v_mul_lo_u32 v13, 0, v5
-; CGP-NEXT:    v_mul_hi_u32 v5, s10, v5
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; CGP-NEXT:    v_mul_lo_u32 v6, s10, v6
-; CGP-NEXT:    v_mul_lo_u32 v7, s10, v7
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v13, v7
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT:    v_subb_u32_e64 v6, s[4:5], v3, v4, vcc
-; CGP-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v4
-; CGP-NEXT:    v_cmp_le_u32_e64 s[4:5], s10, v2
-; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CGP-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v11
-; CGP-NEXT:    v_subb_u32_e64 v7, s[6:7], v1, v5, s[4:5]
-; CGP-NEXT:    v_sub_i32_e64 v1, s[6:7], v1, v5
-; CGP-NEXT:    v_cmp_le_u32_e64 s[6:7], s10, v0
-; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[6:7]
-; CGP-NEXT:    v_cmp_le_u32_e64 s[6:7], 0, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[6:7]
-; CGP-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, 0, v7
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
-; CGP-NEXT:    v_subbrev_u32_e64 v1, vcc, 0, v1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
-; CGP-NEXT:    v_subrev_i32_e32 v8, vcc, s10, v2
-; CGP-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s10, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
-; CGP-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc
-; CGP-NEXT:    v_subrev_i32_e32 v9, vcc, s10, v0
-; CGP-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s10, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, 0, v3
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, vcc
-; CGP-NEXT:    v_subrev_i32_e32 v13, vcc, s10, v8
-; CGP-NEXT:    v_subbrev_u32_e32 v14, vcc, 0, v3, vcc
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, 0, v1
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, -1, vcc
-; CGP-NEXT:    v_subrev_i32_e32 v16, vcc, s10, v9
-; CGP-NEXT:    v_subbrev_u32_e32 v17, vcc, 0, v1, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; CGP-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; CGP-NEXT:    v_cndmask_b32_e32 v11, v15, v11, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; CGP-NEXT:    v_cndmask_b32_e32 v8, v8, v13, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v9, v9, v16, s[4:5]
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v3, v14, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v1, v1, v17, s[4:5]
-; CGP-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[4:5]
-; CGP-NEXT:    v_cndmask_b32_e64 v1, v7, v1, s[4:5]
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
+; CGP-NEXT:    s_movk_i32 s4, 0x1000
+; CGP-NEXT:    s_add_u32 s5, s4, -1
+; CGP-NEXT:    s_cselect_b32 s6, 1, 0
+; CGP-NEXT:    s_and_b32 s6, s6, 1
+; CGP-NEXT:    s_cmp_lg_u32 s6, 0
+; CGP-NEXT:    s_addc_u32 s6, 0, -1
+; CGP-NEXT:    s_add_u32 s4, s4, -1
+; CGP-NEXT:    s_cselect_b32 s7, 1, 0
+; CGP-NEXT:    v_and_b32_e32 v0, s5, v0
+; CGP-NEXT:    s_and_b32 s5, s7, 1
+; CGP-NEXT:    v_and_b32_e32 v1, s6, v1
+; CGP-NEXT:    s_cmp_lg_u32 s5, 0
+; CGP-NEXT:    s_addc_u32 s5, 0, -1
+; CGP-NEXT:    v_and_b32_e32 v2, s4, v2
+; CGP-NEXT:    v_and_b32_e32 v3, s5, v3
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = urem <2 x i64> %num, <i64 4096, i64 4096>
   ret <2 x i64> %result
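For the 64-bit cases, P - 1 is materialized as two 32-bit halves chained
through a carry, which is what the new s_add_u32/s_addc_u32 CHECK lines
compute. A small illustrative model of that arithmetic (not from the patch;
0x1000 matches the divisor used by these tests):

  // Build the 64-bit mask P - 1 from 32-bit halves with an explicit carry,
  // mirroring "s_add_u32 s4, 0x1000, -1" / "s_addc_u32 s5, 0, -1" above.
  #include <cassert>
  #include <cstdint>

  int main() {
    uint64_t P = 0x1000; // the power-of-two divisor from the tests
    uint32_t Lo = uint32_t(P), Hi = uint32_t(P >> 32);
    uint32_t MaskLo = Lo + 0xffffffffu;          // low half: Lo + (-1)
    uint32_t Carry = MaskLo < Lo ? 1u : 0u;      // carry out of the low add
    uint32_t MaskHi = Hi + 0xffffffffu + Carry;  // high half: Hi + (-1) + carry
    assert(((uint64_t(MaskHi) << 32) | MaskLo) == P - 1);
    assert((0x123456789abcULL % P) == (0x123456789abcULL & (P - 1)));
  }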