def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">,
AssemblerPredicate<"FeatureGFX9Insts">;
-def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarryInsts()">,
+def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarry()">,
AssemblerPredicate<"FeatureAddNoCarryInsts">;
-def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarryInsts()">,
+def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarry()">,
AssemblerPredicate<"!FeatureAddNoCarryInsts">;
def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">,
bit ret = !if(a, !if(b, 1, 0), 0);
}
+// Named values for VOPProfile.NeedPatGen. NoPattern (0) is the VOPProfile
+// default and makes VOPPatOrNull yield an empty pattern list; Pattern (1) is
+// the default mode applied by VOP_PAT_GEN.
+def PatGenMode {
+ int NoPattern = 0;
+ int Pattern = 1;
+}
+
class VOPProfile <list<ValueType> _ArgVT> {
field list<ValueType> ArgVT = _ArgVT;
field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
field bit HasSDWA9 = HasExt;
+ field int NeedPatGen = PatGenMode.NoPattern;
field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods);
field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods);
let HasSDWA9 = 0;
}
+// Wraps an existing profile `p`: re-derives a VOPProfile from p.ArgVT and sets
+// NeedPatGen to `mode` (PatGenMode.Pattern unless the caller overrides it).
+// NOTE(review): only p.ArgVT is forwarded to the base class, so field
+// overrides carried by `p` itself are presumably not inherited -- confirm.
+// NOTE(review): some users pass the literal 2 as `mode`, which has no named
+// value in PatGenMode; its meaning is not visible here -- confirm.
+class VOP_PAT_GEN <VOPProfile p, int mode=PatGenMode.Pattern> : VOPProfile <p.ArgVT> {
+ let NeedPatGen = mode;
+}
+
def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>;
def VOP_F16_I16 : VOPProfile <[f16, i16, untyped, untyped]>;
def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>;
let SubtargetPredicate = isGCN;
}
-include "VOPInstructions.td"
include "SOPInstructions.td"
+include "VOPInstructions.td"
include "SMInstructions.td"
include "FLATInstructions.td"
include "BUFInstructions.td"
defm : SelectPat <f16, V_CNDMASK_B32_e64>;
defm : SelectPat <f32, V_CNDMASK_B32_e64>;
+let AddedComplexity = 1 in {
def : GCNPat <
- (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)),
+ (i32 (add (i32 (getDivergentFrag<ctpop>.ret i32:$popcnt)), i32:$val)),
(V_BCNT_U32_B32_e64 $popcnt, $val)
>;
+}
def : GCNPat <
- (i16 (add (i16 (trunc (ctpop i32:$popcnt))), i16:$val)),
+ (i16 (add (i16 (trunc (getDivergentFrag<ctpop>.ret i32:$popcnt))), i16:$val)),
(V_BCNT_U32_B32_e64 $popcnt, $val)
>;
"$sdst, $src0, $src1", pattern
>;
+// Two-operand PatFrag over `Op` whose predicate accepts the node only when it
+// is not divergent (!N->isDivergent()). Used by the scalar (SOP2) patterns so
+// that they match uniform operations only.
+class UniformBinFrag<SDPatternOperator Op> : PatFrag <
+ (ops node:$src0, node:$src1),
+ (Op $src0, $src1),
+ [{ return !N->isDivergent(); }]
+>;
+
let Defs = [SCC] in { // Carry out goes to SCC
let isCommutable = 1 in {
def S_ADD_U32 : SOP2_32 <"s_add_u32">;
def S_ADD_I32 : SOP2_32 <"s_add_i32",
- [(set i32:$sdst, (add SSrc_b32:$src0, SSrc_b32:$src1))]
+ [(set i32:$sdst, (UniformBinFrag<add> SSrc_b32:$src0, SSrc_b32:$src1))]
>;
} // End isCommutable = 1
def S_SUB_U32 : SOP2_32 <"s_sub_u32">;
def S_SUB_I32 : SOP2_32 <"s_sub_i32",
- [(set i32:$sdst, (sub SSrc_b32:$src0, SSrc_b32:$src1))]
+ [(set i32:$sdst, (UniformBinFrag<sub> SSrc_b32:$src0, SSrc_b32:$src1))]
>;
let Uses = [SCC] in { // Carry in comes from SCC
let isCommutable = 1 in {
def S_ADDC_U32 : SOP2_32 <"s_addc_u32",
- [(set i32:$sdst, (adde (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>;
+ [(set i32:$sdst, (UniformBinFrag<adde> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>;
} // End isCommutable = 1
def S_SUBB_U32 : SOP2_32 <"s_subb_u32",
- [(set i32:$sdst, (sube (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>;
+ [(set i32:$sdst, (UniformBinFrag<sube> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>;
} // End Uses = [SCC]
let isCommutable = 1 in {
def S_MIN_I32 : SOP2_32 <"s_min_i32",
- [(set i32:$sdst, (smin i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (UniformBinFrag<smin> i32:$src0, i32:$src1))]
>;
def S_MIN_U32 : SOP2_32 <"s_min_u32",
- [(set i32:$sdst, (umin i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (UniformBinFrag<umin> i32:$src0, i32:$src1))]
>;
def S_MAX_I32 : SOP2_32 <"s_max_i32",
- [(set i32:$sdst, (smax i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (UniformBinFrag<smax> i32:$src0, i32:$src1))]
>;
def S_MAX_U32 : SOP2_32 <"s_max_u32",
- [(set i32:$sdst, (umax i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (UniformBinFrag<umax> i32:$src0, i32:$src1))]
>;
} // End isCommutable = 1
} // End Defs = [SCC]
let Defs = [SCC] in {
let isCommutable = 1 in {
def S_AND_B32 : SOP2_32 <"s_and_b32",
- [(set i32:$sdst, (and i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (UniformBinFrag<and> i32:$src0, i32:$src1))]
>;
def S_AND_B64 : SOP2_64 <"s_and_b64",
- [(set i64:$sdst, (and i64:$src0, i64:$src1))]
+ [(set i64:$sdst, (UniformBinFrag<and> i64:$src0, i64:$src1))]
>;
def S_OR_B32 : SOP2_32 <"s_or_b32",
- [(set i32:$sdst, (or i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (UniformBinFrag<or> i32:$src0, i32:$src1))]
>;
def S_OR_B64 : SOP2_64 <"s_or_b64",
- [(set i64:$sdst, (or i64:$src0, i64:$src1))]
+ [(set i64:$sdst, (UniformBinFrag<or> i64:$src0, i64:$src1))]
>;
def S_XOR_B32 : SOP2_32 <"s_xor_b32",
- [(set i32:$sdst, (xor i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (UniformBinFrag<xor> i32:$src0, i32:$src1))]
>;
def S_XOR_B64 : SOP2_64 <"s_xor_b64",
- [(set i64:$sdst, (xor i64:$src0, i64:$src1))]
+ [(set i64:$sdst, (UniformBinFrag<xor> i64:$src0, i64:$src1))]
>;
def S_XNOR_B32 : SOP2_32 <"s_xnor_b32",
let AddedComplexity = 1 in {
let Defs = [SCC] in {
+// TODO: b64 versions require VOP3 change since v_lshlrev_b64 is VOP3
def S_LSHL_B32 : SOP2_32 <"s_lshl_b32",
- [(set i32:$sdst, (shl i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (UniformBinFrag<shl> i32:$src0, i32:$src1))]
>;
def S_LSHL_B64 : SOP2_64_32 <"s_lshl_b64",
[(set i64:$sdst, (shl i64:$src0, i32:$src1))]
>;
def S_LSHR_B32 : SOP2_32 <"s_lshr_b32",
- [(set i32:$sdst, (srl i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (UniformBinFrag<srl> i32:$src0, i32:$src1))]
>;
def S_LSHR_B64 : SOP2_64_32 <"s_lshr_b64",
[(set i64:$sdst, (srl i64:$src0, i32:$src1))]
>;
def S_ASHR_I32 : SOP2_32 <"s_ashr_i32",
- [(set i32:$sdst, (sra i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (UniformBinFrag<sra> i32:$src0, i32:$src1))]
>;
def S_ASHR_I64 : SOP2_64_32 <"s_ashr_i64",
[(set i64:$sdst, (sra i64:$src0, i32:$src1))]
} // End Defs = [SCC]
def S_BFM_B32 : SOP2_32 <"s_bfm_b32",
- [(set i32:$sdst, (AMDGPUbfm i32:$src0, i32:$src1))]>;
+ [(set i32:$sdst, (UniformBinFrag<AMDGPUbfm> i32:$src0, i32:$src1))]>;
def S_BFM_B64 : SOP2_64_32_32 <"s_bfm_b64">;
+
+// TODO: S_MUL_I32 requires V_MUL_LO_I32 from the VOP3 change
def S_MUL_I32 : SOP2_32 <"s_mul_i32",
[(set i32:$sdst, (mul i32:$src0, i32:$src1))]> {
let isCommutable = 1;
let renamedInGFX9 = GFX9Renamed in {
- def _e32 : VOP2_Pseudo <opName, P>,
+ def _e32 : VOP2_Pseudo <opName, P, VOPPatOrNull<node,P>.ret>,
Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
let renamedInGFX9 = GFX9Renamed in {
let SchedRW = [Write32Bit, WriteSALU] in {
let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in {
- def _e32 : VOP2_Pseudo <opName, P>,
+ def _e32 : VOP2_Pseudo <opName, P, VOPPatOrNull<node,P>.ret>,
Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
// VOP2 Instructions
//===----------------------------------------------------------------------===//
-let SubtargetPredicate = isGCN in {
+let SubtargetPredicate = isGCN, Predicates = [isGCN] in {
defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>;
-def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, [], "">;
+def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, []>;
let isCommutable = 1 in {
defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, fadd>;
defm V_SUBREV_F32 : VOP2Inst <"v_subrev_f32", VOP_F32_F32_F32, null_frag, "v_sub_f32">;
defm V_MUL_LEGACY_F32 : VOP2Inst <"v_mul_legacy_f32", VOP_F32_F32_F32, AMDGPUfmul_legacy>;
defm V_MUL_F32 : VOP2Inst <"v_mul_f32", VOP_F32_F32_F32, fmul>;
-defm V_MUL_I32_I24 : VOP2Inst <"v_mul_i32_i24", VOP_I32_I32_I32, AMDGPUmul_i24>;
-defm V_MUL_HI_I32_I24 : VOP2Inst <"v_mul_hi_i32_i24", VOP_I32_I32_I32, AMDGPUmulhi_i24>;
-defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_I32_I32_I32, AMDGPUmul_u24>;
-defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_I32_I32_I32, AMDGPUmulhi_u24>;
+defm V_MUL_I32_I24 : VOP2Inst <"v_mul_i32_i24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmul_i24>;
+defm V_MUL_HI_I32_I24 : VOP2Inst <"v_mul_hi_i32_i24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmulhi_i24>;
+defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmul_u24>;
+defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmulhi_u24>;
defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum>;
defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum>;
-defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_I32_I32_I32>;
-defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_I32_I32_I32>;
-defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_I32_I32_I32>;
-defm V_MAX_U32 : VOP2Inst <"v_max_u32", VOP_I32_I32_I32>;
+defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smin>;
+defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smax>;
+defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umin>;
+defm V_MAX_U32 : VOP2Inst <"v_max_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umax>;
defm V_LSHRREV_B32 : VOP2Inst <"v_lshrrev_b32", VOP_I32_I32_I32, null_frag, "v_lshr_b32">;
defm V_ASHRREV_I32 : VOP2Inst <"v_ashrrev_i32", VOP_I32_I32_I32, null_frag, "v_ashr_i32">;
defm V_LSHLREV_B32 : VOP2Inst <"v_lshlrev_b32", VOP_I32_I32_I32, null_frag, "v_lshl_b32">;
-defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_I32_I32_I32>;
-defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_I32_I32_I32>;
-defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_I32_I32_I32>;
+defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, and>;
+defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, or>;
+defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, xor>;
let Constraints = "$vdst = $src2", DisableEncoding="$src2",
isConvertibleToThreeAddress = 1 in {
defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC_F32>;
}
-def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, [], "">;
+def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, []>;
// No patterns so that the scalar instructions are always selected.
// The scalar versions will be replaced with vector when needed later.
// These are special and do not read the exec mask.
let isConvergent = 1, Uses = []<Register> in {
def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,
- [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))], "">;
+ [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))]>;
let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE,
- [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))], "">;
+ [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))]>;
} // End $vdst = $vdst_in, DisableEncoding $vdst_in
} // End isConvergent = 1
defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_V2I16_I32_I32>, AMDGPUpk_u16_u32>;
defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_V2I16_I32_I32>, AMDGPUpk_i16_i32>;
-} // End SubtargetPredicate = isGCN
+} // End SubtargetPredicate = isGCN, Predicates = [isGCN]
def : GCNPat<
(AMDGPUadde i32:$src0, i32:$src1, i1:$src2),
>;
// These instructions only exist on SI and CI
-let SubtargetPredicate = isSICI in {
+let SubtargetPredicate = isSICI, Predicates = [isSICI] in {
defm V_MIN_LEGACY_F32 : VOP2Inst <"v_min_legacy_f32", VOP_F32_F32_F32, AMDGPUfmin_legacy>;
defm V_MAX_LEGACY_F32 : VOP2Inst <"v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfmax_legacy>;
let isCommutable = 1 in {
defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_F32_F32_F32>;
-defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32>;
-defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32>;
-defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>;
+defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, srl>;
+defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, sra>;
+defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, shl>;
} // End isCommutable = 1
-} // End let SubtargetPredicate = SICI
+} // End let SubtargetPredicate = isSICI, Predicates = [isSICI]
+
+// GCNPat that selects the VOP pseudo `Inst` for a divergent two-operand `Op`.
+// The source pattern uses getDivergentFrag<Op>.ret with operand types taken
+// from Inst's profile (Inst.Pfl.Src0VT / Inst.Pfl.Src1VT).
+// The output keeps the operand order when Inst's Commutable_REV.IsOrig bit is
+// set, and swaps $src0/$src1 otherwise (i.e. for the "rev" form).
+class DivergentBinOp<SDPatternOperator Op, VOP_Pseudo Inst> :
+ GCNPat<
+ (getDivergentFrag<Op>.ret Inst.Pfl.Src0VT:$src0, Inst.Pfl.Src1VT:$src1),
+ !if(!cast<Commutable_REV>(Inst).IsOrig,
+ (Inst $src0, $src1),
+ (Inst $src1, $src0)
+ )
+ >;
+
+// Divergent 32-bit shifts: the _e64 patterns in this scope get
+// AddedComplexity = 1; the _e32 patterns for the same operators further down
+// are defined without it.
+let AddedComplexity = 1 in {
+ def : DivergentBinOp<srl, V_LSHRREV_B32_e64>;
+ def : DivergentBinOp<sra, V_ASHRREV_I32_e64>;
+ def : DivergentBinOp<shl, V_LSHLREV_B32_e64>;
+}
+
+// Divergent add/sub patterns guarded by the HasAddNoCarryInsts predicate.
+let SubtargetPredicate = HasAddNoCarryInsts in {
+ def : DivergentBinOp<add, V_ADD_U32_e32>;
+ def : DivergentBinOp<sub, V_SUB_U32_e32>;
+ def : DivergentBinOp<sub, V_SUBREV_U32_e32>;
+}
+
+
+// NOTE(review): `add` and `sub` each receive more than one pattern below
+// (V_ADD_I32 _e32 and _e64; V_SUB_I32 and V_SUBREV_I32) with no
+// SubtargetPredicate set in this scope -- confirm the overlap is intended.
+def : DivergentBinOp<add, V_ADD_I32_e32>;
+
+def : DivergentBinOp<add, V_ADD_I32_e64>;
+def : DivergentBinOp<sub, V_SUB_I32_e32>;
+
+def : DivergentBinOp<sub, V_SUBREV_I32_e32>;
+
+def : DivergentBinOp<srl, V_LSHRREV_B32_e32>;
+def : DivergentBinOp<sra, V_ASHRREV_I32_e32>;
+def : DivergentBinOp<shl, V_LSHLREV_B32_e32>;
+def : DivergentBinOp<adde, V_ADDC_U32_e32>;
+def : DivergentBinOp<sube, V_SUBB_U32_e32>;
+
+// GCNPat that splits a divergent 64-bit `Op` into two applications of `Inst`:
+// one on the sub0 halves of both operands and one on the sub1 halves. The two
+// i32 results are recombined into a VReg_64 with REG_SEQUENCE.
+class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst> :
+ GCNPat<
+ (getDivergentFrag<Op>.ret i64:$src0, i64:$src1),
+ (REG_SEQUENCE VReg_64,
+ (Inst
+ (i32 (EXTRACT_SUBREG $src0, sub0)),
+ (i32 (EXTRACT_SUBREG $src1, sub0))
+ ), sub0,
+ (Inst
+ (i32 (EXTRACT_SUBREG $src0, sub1)),
+ (i32 (EXTRACT_SUBREG $src1, sub1))
+ ), sub1
+ )
+ >;
+
+// Divergent 64-bit and/or/xor are selected as pairs of the corresponding
+// 32-bit V_*_B32_e32 instructions.
+def : divergent_i64_BinOp <and, V_AND_B32_e32>;
+def : divergent_i64_BinOp <or, V_OR_B32_e32>;
+def : divergent_i64_BinOp <xor, V_XOR_B32_e32>;
let SubtargetPredicate = Has16BitInsts in {
let DecoderNamespace = "DPP";
}
+// Helper: casts `Op` to an SDNode record and exposes the NumOperands field of
+// its TypeProfile as `ret`.
+// NOTE(review): the !cast<SDNode> presumably requires `Op` to be an SDNode
+// definition rather than a PatFrag -- confirm against callers.
+class getNumNodeArgs<SDPatternOperator Op> {
+ SDNode N = !cast<SDNode>(Op);
+ SDTypeProfile TP = N.TypeProfile;
+ int ret = TP.NumOperands;
+}
+
+
+// Helper: `ret` is a PatFrag over `Op` whose predicate accepts the node only
+// when it is divergent (N->isDivergent()).
+// The operand list and fragment body are chosen from getNumNodeArgs<Op>.ret:
+// one operand, two operands, or -- for any other count -- three operands.
+class getDivergentFrag<SDPatternOperator Op> {
+
+ int NumSrcArgs = getNumNodeArgs<Op>.ret;
+ PatFrag ret = PatFrag <
+ !if(!eq(NumSrcArgs, 1),
+ (ops node:$src0),
+ !if(!eq(NumSrcArgs, 2),
+ (ops node:$src0, node:$src1),
+ (ops node:$src0, node:$src1, node:$src2))),
+ !if(!eq(NumSrcArgs, 1),
+ (Op $src0),
+ !if(!eq(NumSrcArgs, 2),
+ (Op $src0, $src1),
+ (Op $src0, $src1, $src2))),
+ [{ return N->isDivergent(); }]
+ >;
+}
+
+// Helper: derives a selection pattern for `Op` from the 32-bit operand lists
+// of profile `P`. The result is the single-element list `ret`.
+class VOPPatGen<SDPatternOperator Op, VOPProfile P> {
+
+ // Divergent-only fragment for Op; it becomes the operator of the source dag.
+ PatFrag Operator = getDivergentFrag < Op >.ret;
+
+ // P.Ins32 with the `ins` operator replaced by Operator and the Src0RC32 /
+ // Src1RC32 operand classes replaced by the value types Src0VT / Src1VT.
+ dag Ins = !foreach(tmp, P.Ins32, !subst(ins, Operator,
+ !subst(P.Src0RC32, P.Src0VT,
+ !subst(P.Src1RC32, P.Src1VT, tmp))));
+
+
+ // P.Outs32 with the `outs` operator replaced by `set` and DstRC replaced by
+ // the value type DstVT.
+ dag Outs = !foreach(tmp, P.Outs32, !subst(outs, set,
+ !subst(P.DstRC, P.DstVT, tmp)));
+
+ // Concatenate the two `set` dags: destination operand(s) followed by Ins.
+ list<dag> ret = [!con(Outs, (set Ins))];
+}
+
+// Helper: `ret` is VOPPatGen<Op, P>.ret when P.NeedPatGen differs from
+// PatGenMode.NoPattern, and the empty list otherwise.
+class VOPPatOrNull<SDPatternOperator Op, VOPProfile P> {
+ list<dag> ret = !if(!ne(P.NeedPatGen,PatGenMode.NoPattern), VOPPatGen<Op, P>.ret, []);
+}
+
include "VOPCInstructions.td"
include "VOP1Instructions.td"
include "VOP2Instructions.td"
; FUNC-LABEL: {{^}}v_add_i32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
-; SIVI: v_add_{{i|u}}32_e32 v{{[0-9]+}}, vcc, [[B]], [[A]]
+; SIVI: v_add_{{i|u}}32_e32 v{{[0-9]+}}, vcc, [[A]], [[B]]
; GFX9: v_add_u32_e32 v{{[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @v_add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x()
; GCN-LABEL: {{^}}work_item_info:
; GCN-NOT: v0
-; GCN: v_add_{{[iu]}}32_e32 [[RESULT:v[0-9]+]], vcc, v0, v{{[0-9]+}}
+; GCN: v_add_{{[iu]}}32_e32 [[RESULT:v[0-9]+]], vcc, v{{[0-9]+}}, v0
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
entry:
; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
; GCN: v_sub_{{[iu]}}32_e32 [[SUB:v[0-9]+]], vcc, 32, [[WIDTH]]
-; SI-NEXT: v_lshl_b32_e32 [[SHL:v[0-9]+]], [[SRC]], [[SUB]]
-; SI-NEXT: v_lshr_b32_e32 [[BFE:v[0-9]+]], [[SHL]], [[SUB]]
-
-; VI-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
-; VI-NEXT: v_lshrrev_b32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
+; GCN-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
+; GCN-NEXT: v_lshrrev_b32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
; GCN: [[BFE]]
; GCN: [[SHL]]
; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
; GCN: v_sub_{{[iu]}}32_e32 [[SUB:v[0-9]+]], vcc, 32, [[WIDTH]]
-; SI-NEXT: v_lshl_b32_e32 [[SHL:v[0-9]+]], [[SRC]], [[SUB]]
-; SI-NEXT: v_ashr_i32_e32 [[BFE:v[0-9]+]], [[SHL]], [[SUB]]
-
-; VI-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
-; VI-NEXT: v_ashrrev_i32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
+; GCN-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
+; GCN-NEXT: v_ashrrev_i32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
; GCN: [[BFE]]
; GCN: [[SHL]]
; GCN-DAG: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT2:v[0-9]+]], v[[VAL0]], 0
; GCN-DAG: v_bcnt_u32_b32{{(_e32)*(_e64)*}} [[MIDRESULT3:v[0-9]+]], v{{[0-9]+}}, [[MIDRESULT2]]
-; GCN: v_add_{{[iu]}}32_e32 [[RESULT:v[0-9]+]], vcc, [[MIDRESULT1]], [[MIDRESULT2]]
+; GCN: v_add_{{[iu]}}32_e32 [[RESULT:v[0-9]+]], vcc, [[MIDRESULT2]], [[MIDRESULT1]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_lshl_b32_e32 v0, v0, v1
-; SI-NEXT: v_lshr_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bzhi32_d1_indexzext:
; GCN-LABEL: {{^}}v_fabs_fold_self_v2f16:
; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
-; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
-; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
-; CI: v_cvt_f32_f16_e32
-; CI: v_cvt_f32_f16_e32
-; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_lshrrev_b32_e32 [[VREG:v[0-9]+]], 16, v{{[0-9]+}}
+; CI: v_cvt_f32_f16_e32 [[NORM:v[0-9]+]], [[VREG]]
+; CI: v_cvt_f32_f16_e64 [[ABS:v[0-9]+]], {{\|}}[[VREG]]{{\|}}
+; CI: v_mul_f32_e32 v{{[0-9]+}}, [[ABS]], [[NORM]]
; CI: v_cvt_f16_f32
; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CI: v_cvt_f16_f32
; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[VEC]]
; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e70000, [[AND]]
-; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[K]], [[VEC]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; CI: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0xfff10000, [[ELT0]]
-; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[K]], [[VEC]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], -15, 16, [[ELT0]]
; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[VEC]]
; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x45000000, [[AND]]
-; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[K]], [[VEC]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
; CI: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x230000, [[ELT0]]
-; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[K]], [[VEC]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], 35, 16, [[ELT0]]
; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
; VI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL_HI]]
-; VI: v_or_b32_sdwa v[[INS_HALF:[0-9]+]], v[[LO]], [[COPY_VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI: v_or_b32_sdwa v[[INS_HALF:[0-9]+]], [[COPY_VAL]], v[[LO]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; CI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16
; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[LO]]
; VI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL_HI]]
-; VI: v_or_b32_sdwa v[[INS_HI:[0-9]+]], v[[HI]], [[COPY_VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI: v_or_b32_sdwa v[[INS_HI:[0-9]+]], [[COPY_VAL]], v[[HI]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; CI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16
; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[HI]]
}
; VI-LABEL: {{^}}dpp_test1:
-; VI-OPT: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}
; VI-NOOPT: v_mov_b32_e32 v{{[0-9]+}}, 0
-; VI-NOOPT: v_mov_b32_e32 [[REG:v[0-9]+]], v{{[0-9]+}}
; VI-NEXT: s_nop 0
; VI-NEXT: s_nop 0
; VI-NEXT: v_mov_b32_dpp v2, [[REG]] quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
; GCN-LABEL: v_shl_i128_vv:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_lshl_b64 v[5:6], v[0:1], v4
-; GCN-NEXT: v_lshl_b64 v[7:8], v[2:3], v4
+; GCN-NEXT: v_lshl_b64 v[5:6], v[2:3], v4
; GCN-NEXT: v_sub_i32_e32 v9, vcc, 64, v4
; GCN-NEXT: v_subrev_i32_e32 v11, vcc, 64, v4
-; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4
-; GCN-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GCN-NEXT: v_lshl_b64 v[7:8], v[0:1], v4
; GCN-NEXT: v_lshr_b64 v[9:10], v[0:1], v9
; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v11
-; GCN-NEXT: v_or_b32_e32 v7, v7, v9
-; GCN-NEXT: v_or_b32_e32 v8, v8, v10
-; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
-; GCN-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
-; GCN-NEXT: v_mov_b32_e32 v0, v5
-; GCN-NEXT: v_mov_b32_e32 v1, v6
+; GCN-NEXT: v_or_b32_e32 v6, v6, v10
+; GCN-NEXT: v_or_b32_e32 v5, v5, v9
+; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4
+; GCN-NEXT: v_cndmask_b32_e32 v6, v1, v6, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v8, vcc
+; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v4
+; GCN-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v2, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%shl = shl i128 %lhs, %rhs
ret i128 %shl
; GCN-LABEL: v_lshr_i128_vv:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_lshr_b64 v[5:6], v[2:3], v4
-; GCN-NEXT: v_lshr_b64 v[7:8], v[0:1], v4
+; GCN-NEXT: v_lshr_b64 v[5:6], v[0:1], v4
; GCN-NEXT: v_sub_i32_e32 v9, vcc, 64, v4
; GCN-NEXT: v_subrev_i32_e32 v11, vcc, 64, v4
-; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4
-; GCN-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GCN-NEXT: v_lshr_b64 v[7:8], v[2:3], v4
; GCN-NEXT: v_lshl_b64 v[9:10], v[2:3], v9
; GCN-NEXT: v_lshr_b64 v[2:3], v[2:3], v11
-; GCN-NEXT: v_or_b32_e32 v7, v7, v9
-; GCN-NEXT: v_or_b32_e32 v8, v8, v10
-; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
-; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GCN-NEXT: v_mov_b32_e32 v2, v5
-; GCN-NEXT: v_mov_b32_e32 v3, v6
+; GCN-NEXT: v_or_b32_e32 v6, v6, v10
+; GCN-NEXT: v_or_b32_e32 v5, v5, v9
+; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4
+; GCN-NEXT: v_cndmask_b32_e32 v6, v3, v6, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v8, vcc
+; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v4
+; GCN-NEXT: v_cndmask_b32_e64 v1, v6, v1, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v7, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
+
%shl = lshr i128 %lhs, %rhs
ret i128 %shl
}
; GCN-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc
; GCN-NEXT: v_lshl_b64 v[9:10], v[2:3], v10
; GCN-NEXT: v_ashr_i64 v[2:3], v[2:3], v11
-; GCN-NEXT: v_or_b32_e32 v7, v7, v9
; GCN-NEXT: v_or_b32_e32 v8, v8, v10
+; GCN-NEXT: v_or_b32_e32 v7, v7, v9
; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
; GCN-NEXT: v_lshl_b64 v[16:17], v[2:3], v8
; GCN-NEXT: v_sub_i32_e32 v18, vcc, 64, v8
; GCN-NEXT: v_lshr_b64 v[18:19], v[0:1], v18
-; GCN-NEXT: v_or_b32_e32 v20, v16, v18
-; GCN-NEXT: v_or_b32_e32 v21, v17, v19
+; GCN-NEXT: v_or_b32_e32 v20, v17, v19
+; GCN-NEXT: v_or_b32_e32 v21, v16, v18
; GCN-NEXT: v_sub_i32_e32 v16, vcc, 64, v12
; GCN-NEXT: v_lshr_b64 v[16:17], v[4:5], v16
; GCN-NEXT: v_lshl_b64 v[18:19], v[6:7], v12
-; GCN-NEXT: v_or_b32_e32 v16, v18, v16
; GCN-NEXT: v_or_b32_e32 v17, v19, v17
+; GCN-NEXT: v_or_b32_e32 v16, v18, v16
; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
-; GCN-NEXT: v_or_b32_e32 v10, v8, v10
; GCN-NEXT: v_or_b32_e32 v11, v9, v11
+; GCN-NEXT: v_or_b32_e32 v10, v8, v10
; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
-; GCN-NEXT: v_or_b32_e32 v14, v12, v14
; GCN-NEXT: v_or_b32_e32 v15, v13, v15
+; GCN-NEXT: v_or_b32_e32 v14, v12, v14
; GCN-NEXT: v_cmp_gt_u64_e64 s[10:11], 64, v[8:9]
; GCN-NEXT: v_subrev_i32_e32 v18, vcc, 64, v8
; GCN-NEXT: v_lshl_b64 v[8:9], v[0:1], v8
; GCN-NEXT: v_lshl_b64 v[12:13], v[4:5], v12
; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v18
; GCN-NEXT: s_and_b64 vcc, s[6:7], s[10:11]
-; GCN-NEXT: v_cndmask_b32_e32 v18, v1, v21, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v19, v0, v20, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v18, v1, v20, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v19, v0, v21, vcc
; GCN-NEXT: s_and_b64 s[6:7], s[8:9], s[12:13]
; GCN-NEXT: v_cndmask_b32_e64 v17, v5, v17, s[6:7]
; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v16, s[6:7]
; GCN-NEXT: v_lshr_b64 v[16:17], v[0:1], v8
; GCN-NEXT: v_sub_i32_e32 v18, vcc, 64, v8
; GCN-NEXT: v_lshl_b64 v[18:19], v[2:3], v18
-; GCN-NEXT: v_or_b32_e32 v20, v16, v18
-; GCN-NEXT: v_or_b32_e32 v21, v17, v19
+; GCN-NEXT: v_or_b32_e32 v20, v17, v19
+; GCN-NEXT: v_or_b32_e32 v21, v16, v18
; GCN-NEXT: v_sub_i32_e32 v16, vcc, 64, v12
; GCN-NEXT: v_lshl_b64 v[16:17], v[6:7], v16
; GCN-NEXT: v_lshr_b64 v[18:19], v[4:5], v12
-; GCN-NEXT: v_or_b32_e32 v16, v18, v16
; GCN-NEXT: v_or_b32_e32 v17, v19, v17
+; GCN-NEXT: v_or_b32_e32 v16, v18, v16
; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
-; GCN-NEXT: v_or_b32_e32 v10, v8, v10
; GCN-NEXT: v_or_b32_e32 v11, v9, v11
+; GCN-NEXT: v_or_b32_e32 v10, v8, v10
; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
-; GCN-NEXT: v_or_b32_e32 v14, v12, v14
; GCN-NEXT: v_or_b32_e32 v15, v13, v15
+; GCN-NEXT: v_or_b32_e32 v14, v12, v14
; GCN-NEXT: v_cmp_gt_u64_e64 s[10:11], 64, v[8:9]
; GCN-NEXT: v_subrev_i32_e32 v18, vcc, 64, v8
; GCN-NEXT: v_lshr_b64 v[8:9], v[2:3], v8
; GCN-NEXT: v_lshr_b64 v[12:13], v[6:7], v12
; GCN-NEXT: v_lshr_b64 v[6:7], v[6:7], v18
; GCN-NEXT: s_and_b64 vcc, s[6:7], s[10:11]
-; GCN-NEXT: v_cndmask_b32_e32 v18, v3, v21, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v19, v2, v20, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v18, v3, v20, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v19, v2, v21, vcc
; GCN-NEXT: s_and_b64 s[6:7], s[8:9], s[12:13]
; GCN-NEXT: v_cndmask_b32_e64 v17, v7, v17, s[6:7]
; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v16, s[6:7]
; GCN-NEXT: v_lshr_b64 v[16:17], v[0:1], v8
; GCN-NEXT: v_sub_i32_e32 v18, vcc, 64, v8
; GCN-NEXT: v_lshl_b64 v[18:19], v[2:3], v18
-; GCN-NEXT: v_or_b32_e32 v20, v16, v18
-; GCN-NEXT: v_or_b32_e32 v21, v17, v19
+; GCN-NEXT: v_or_b32_e32 v20, v17, v19
+; GCN-NEXT: v_or_b32_e32 v21, v16, v18
; GCN-NEXT: v_sub_i32_e32 v16, vcc, 64, v12
; GCN-NEXT: v_lshl_b64 v[16:17], v[6:7], v16
; GCN-NEXT: v_lshr_b64 v[18:19], v[4:5], v12
-; GCN-NEXT: v_or_b32_e32 v18, v18, v16
; GCN-NEXT: v_or_b32_e32 v19, v19, v17
+; GCN-NEXT: v_or_b32_e32 v18, v18, v16
; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
-; GCN-NEXT: v_or_b32_e32 v10, v8, v10
; GCN-NEXT: v_or_b32_e32 v11, v9, v11
+; GCN-NEXT: v_or_b32_e32 v10, v8, v10
; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
-; GCN-NEXT: v_or_b32_e32 v14, v12, v14
; GCN-NEXT: v_or_b32_e32 v15, v13, v15
+; GCN-NEXT: v_or_b32_e32 v14, v12, v14
; GCN-NEXT: v_cmp_gt_u64_e64 s[10:11], 64, v[8:9]
; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v8
; GCN-NEXT: v_ashr_i64 v[16:17], v[2:3], v9
; GCN-NEXT: s_and_b64 s[6:7], s[6:7], s[10:11]
-; GCN-NEXT: v_cndmask_b32_e64 v17, v17, v21, s[6:7]
-; GCN-NEXT: v_cndmask_b32_e64 v16, v16, v20, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v17, v17, v20, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v16, v16, v21, s[6:7]
; GCN-NEXT: v_cmp_gt_u64_e64 s[10:11], 64, v[12:13]
; GCN-NEXT: v_ashr_i64 v[8:9], v[2:3], v8
; GCN-NEXT: v_ashrrev_i32_e32 v20, 31, v3
; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, [[LHS]]
; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; CI: v_lshl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; FIXME: Why is this commuted only sometimes?
; GCN-LABEL: {{^}}i32_fastcc_i32_i32:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v1, v0
+; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GCN-NEXT: s_setpc_b64
define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 {
; GCN-LABEL: {{^}}i32_fastcc_i32_i32_stack_object:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v1, v0
+; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GCN: s_mov_b32 s5, s32
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:24
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s5 offset:4
; GCN-NEXT: s_waitcnt vmcnt(0)
-; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v1, v0
+; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s5 offset:4
; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s5 offset:8
-; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v1, v0
-; CIVI: v_add_{{i|u}}32_e32 v0, vcc, [[LOAD_0]], v0
-; CIVI: v_add_{{i|u}}32_e32 v0, vcc, [[LOAD_1]], v0
+; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
+; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_0]]
+; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_1]]
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
}
; FUNC-LABEL: {{^}}test_sub_i16:
-; SI: v_subrev_i32_e32 v{{[0-9]+}}, vcc,
+; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
; GFX89: v_sub_u16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
%tid = call i32 @llvm.r600.read.tidig.x()