A bitwise "not" is defined as "xor $src, -1".
We need to select this pattern to either S_NOT_B32 or V_NOT_B32_e32,
depending on whether the "xor" node is divergent.
Reviewed By: rampitec, foad
Differential Revision: https://reviews.llvm.org/D115884
} // end isWave32
// Divergent 32-bit "not" (xor x, -1) selects the VALU V_NOT_B32.
def : GCNPat <
+ (i32 (DivergentBinFrag<xor> i32:$src0, (i32 -1))),
+ (V_NOT_B32_e32 $src0)
+>;
+
+// Divergent 64-bit "not": there is no 64-bit VALU not, so invert each
+// 32-bit half with V_NOT_B32 and recombine the halves into a VReg_64.
+def : GCNPat <
+ (i64 (DivergentBinFrag<xor> i64:$src0, (i64 -1))),
+ (REG_SEQUENCE VReg_64,
+ (V_NOT_B32_e32 (i32 (EXTRACT_SUBREG i64:$src0, sub0))), sub0,
+ (V_NOT_B32_e32 (i32 (EXTRACT_SUBREG i64:$src0, sub1))), sub1
+ )
+>;
+
+def : GCNPat <
(f16 (sint_to_fp i1:$src)),
(V_CVT_F16_F32_e32 (
V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
let has_sdst = 0;
}
+// Matches a unary Op only when the selection-DAG node is uniform
+// (!N->isDivergent()), so the pattern can select a scalar (SALU)
+// instruction.
+class UniformUnaryFrag<SDPatternOperator Op> : PatFrag <
+ (ops node:$src0),
+ (Op $src0),
+ [{ return !N->isDivergent(); }]> {
+ // This check is unnecessary as it's captured by the result register
+ // bank constraint.
+ //
+ // FIXME: Should add a way for the emitter to recognize this is a
+ // trivially true predicate to eliminate the check.
+ let GISelPredicateCode = [{return true;}];
+}
+
+// Matches a binary Op only when the selection-DAG node is uniform
+// (!N->isDivergent()), so the pattern can select a scalar (SALU)
+// instruction.
+class UniformBinFrag<SDPatternOperator Op> : PatFrag <
+ (ops node:$src0, node:$src1),
+ (Op $src0, $src1),
+ [{ return !N->isDivergent(); }]> {
+ // This check is unnecessary as it's captured by the result register
+ // bank constraint.
+ //
+ // FIXME: Should add a way for the emitter to recognize this is a
+ // trivially true predicate to eliminate the check.
+ let GISelPredicateCode = [{return true;}];
+}
+
+// Matches a binary Op only when the selection-DAG node is divergent
+// (N->isDivergent()), so the pattern can select a vector (VALU)
+// instruction.
+class DivergentBinFrag<SDPatternOperator Op> : PatFrag <
+ (ops node:$src0, node:$src1),
+ (Op $src0, $src1),
+ [{ return N->isDivergent(); }]> {
+ // This check is unnecessary as it's captured by the result register
+ // bank constraint.
+ //
+ // FIXME: Should add a way for the emitter to recognize this is a
+ // trivially true predicate to eliminate the check.
+ let GISelPredicateCode = [{return true;}];
+}
+
let isMoveImm = 1 in {
let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
let Defs = [SCC] in {
// Restrict the scalar s_not patterns to uniform nodes; divergent "not"
// (xor x, -1) is selected to V_NOT_B32_e32 instead.
def S_NOT_B32 : SOP1_32 <"s_not_b32",
- [(set i32:$sdst, (not i32:$src0))]
+ [(set i32:$sdst, (UniformUnaryFrag<not> i32:$src0))]
>;
def S_NOT_B64 : SOP1_64 <"s_not_b64",
- [(set i64:$sdst, (not i64:$src0))]
+ [(set i64:$sdst, (UniformUnaryFrag<not> i64:$src0))]
>;
def S_WQM_B32 : SOP1_32 <"s_wqm_b32">;
def S_WQM_B64 : SOP1_64 <"s_wqm_b64">;
"$sdst, $src0, $src1", pattern
>;
-class UniformUnaryFrag<SDPatternOperator Op> : PatFrag <
- (ops node:$src0),
- (Op $src0),
- [{ return !N->isDivergent(); }]> {
- // This check is unnecessary as it's captured by the result register
- // bank constraint.
- //
- // FIXME: Should add a way for the emitter to recognize this is a
- // trivially true predicate to eliminate the check.
- let GISelPredicateCode = [{return true;}];
-}
-
-class UniformBinFrag<SDPatternOperator Op> : PatFrag <
- (ops node:$src0, node:$src1),
- (Op $src0, $src1),
- [{ return !N->isDivergent(); }]> {
- // This check is unnecessary as it's captured by the result register
- // bank constraint.
- //
- // FIXME: Should add a way for the emitter to recognize this is a
- // trivially true predicate to eliminate the check.
- let GISelPredicateCode = [{return true;}];
-}
-
-class DivergentBinFrag<SDPatternOperator Op> : PatFrag <
- (ops node:$src0, node:$src1),
- (Op $src0, $src1),
- [{ return N->isDivergent(); }]> {
- // This check is unnecessary as it's captured by the result register
- // bank constraint.
- //
- // FIXME: Should add a way for the emitter to recognize this is a
- // trivially true predicate to eliminate the check.
- let GISelPredicateCode = [{return true;}];
-}
let Defs = [SCC] in { // Carry out goes to SCC
let isCommutable = 1 in {
--- /dev/null
+; RUN: llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GCN %s
+;
+; Check instruction selection for "not" (xor x, -1): uniform values must
+; select the scalar S_NOT_B32/S_NOT_B64, divergent values the VALU
+; V_NOT_B32_e32 (twice for i64, once per 32-bit half).
+
+; Kernel arguments are uniform, so the xor selects the scalar form.
+; GCN-LABEL: name: scalar_not_i32
+; GCN: S_NOT_B32
+define amdgpu_kernel void @scalar_not_i32(i32 addrspace(1)* %out, i32 %val) {
+ %not.val = xor i32 %val, -1
+ store i32 %not.val, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: name: scalar_not_i64
+; GCN: S_NOT_B64
+define amdgpu_kernel void @scalar_not_i64(i64 addrspace(1)* %out, i64 %val) {
+ %not.val = xor i64 %val, -1
+ store i64 %not.val, i64 addrspace(1)* %out
+ ret void
+}
+
+; Non-kernel function arguments are divergent, so the vector form is used.
+; GCN-LABEL: name: vector_not_i32
+; GCN: V_NOT_B32_e32
+define i32 @vector_not_i32(i32 %val) {
+ %not.val = xor i32 %val, -1
+ ret i32 %not.val
+}
+
+; The divergent i64 "not" is expanded into two 32-bit V_NOT_B32 ops.
+; GCN-LABEL: name: vector_not_i64
+; GCN: V_NOT_B32_e32
+; GCN: V_NOT_B32_e32
+define i64 @vector_not_i64(i64 %val) {
+ %not.val = xor i64 %val, -1
+ ret i64 %not.val
+}
+
+
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_or3_b32 v1, v3, v1, v5
; GCN-NEXT: v_or3_b32 v0, v2, v0, v4
-; GCN-NEXT: v_not_b32_e32 v0, v0
; GCN-NEXT: v_not_b32_e32 v1, v1
+; GCN-NEXT: v_not_b32_e32 v0, v0
; GCN-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
; GCN-NEXT: s_endpgm
bb:
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v1, v1, v5
; GCN-NEXT: v_and_b32_e32 v0, v0, v4
-; GCN-NEXT: v_not_b32_e32 v0, v0
; GCN-NEXT: v_not_b32_e32 v1, v1
+; GCN-NEXT: v_not_b32_e32 v0, v0
; GCN-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
; GCN-NEXT: s_endpgm
bb:
; SI-NEXT: v_add_i32_e32 v6, vcc, s4, v4
; SI-NEXT: v_lshr_b64 v[4:5], s[2:3], v6
; SI-NEXT: v_and_b32_e32 v7, 0x80000000, v3
-; SI-NEXT: v_not_b32_e32 v4, v4
; SI-NEXT: v_not_b32_e32 v5, v5
+; SI-NEXT: v_not_b32_e32 v4, v4
; SI-NEXT: v_and_b32_e32 v5, v3, v5
; SI-NEXT: v_and_b32_e32 v4, v2, v4
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v6
; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v2
; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v3, vcc
-; GCN-IR-NEXT: v_not_b32_e32 v6, v8
-; GCN-IR-NEXT: v_not_b32_e32 v7, v9
-; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v6, v10
+; GCN-IR-NEXT: v_not_b32_e32 v7, v8
+; GCN-IR-NEXT: v_not_b32_e32 v6, v9
+; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v7, v10
; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v12
-; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, v7, v11, vcc
+; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, v6, v11, vcc
; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
; GCN-IR-NEXT: v_mov_b32_e32 v7, 0