// SI pseudo instructions. These are used by the CFG structurizer pass
// and should be lowered to ISA instructions prior to codegen.
-let isTerminator = 1 in {
+// As we have enhanced control flow intrinsics to work under unstructured CFG,
+// duplicating such intrinsics can be actually treated as legal. On the contrary,
+// by making them non-duplicable, we are observing better code generation result.
+// So we choose to mark them non-duplicable in hope of getting better code
+// generation as well as simplied CFG during Machine IR optimization stage.
+
+let isTerminator = 1, isNotDuplicable = 1 in {
let OtherPredicates = [EnableLateCFGStructurize] in {
def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI <
let isAsCheapAsAMove = 1;
let isReMaterializable = 1;
let hasSideEffects = 1;
+ let isNotDuplicable = 1; // Not a hard requirement, see long comments above for details.
let mayLoad = 1; // FIXME: Should not need memory flags
let mayStore = 1;
}
def SI_IF_BREAK : CFPseudoInstSI <
(outs SReg_1:$dst), (ins SReg_1:$vcc, SReg_1:$src), []> {
let Size = 4;
+ let isNotDuplicable = 1; // Not a hard requirement, see long comments above for details.
let isAsCheapAsAMove = 1;
let isReMaterializable = 1;
}
; GFX10-NEXT: ; in Loop: Header=BB0_2 Depth=1
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT: s_and_b32 s0, exec_lo, vcc_lo
+; GFX10-NEXT: s_or_b32 s2, s0, s2
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_execz .LBB0_5
; GFX10-NEXT: .LBB0_2: ; %bb4
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_and_b32 s0, exec_lo, vcc_lo
-; GFX10-NEXT: s_or_b32 s2, s0, s2
; GFX10-NEXT: s_and_saveexec_b32 s3, s1
; GFX10-NEXT: s_cbranch_execz .LBB0_1
; GFX10-NEXT: ; %bb.3: ; in Loop: Header=BB0_2 Depth=1
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=early-tailduplication -verify-machineinstrs -o - %s | FileCheck %s
+
+---
+name: stop_duplicate_cfg_intrinsic
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: stop_duplicate_cfg_intrinsic
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %6, %bb.3
+ ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.3(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+ ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[PHI]], [[COPY2]], 0, implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_ADD_CO_U32_e64_]], %bb.2, [[PHI]], %bb.1
+ ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295
+ ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[DEF]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+ ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET [[PHI1]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ bb.1:
+ liveins: $vgpr0
+
+ %0:vgpr_32 = COPY $vgpr0
+ %12:sreg_64 = IMPLICIT_DEF
+ %4:sreg_32 = S_MOV_B32 0
+ %14:vgpr_32 = COPY %4:sreg_32
+ %5:sreg_64_xexec = V_CMP_EQ_U32_e64 %0:vgpr_32, %14:vgpr_32, implicit $exec
+
+ bb.2:
+ %6:vgpr_32 = PHI %4:sreg_32, %bb.1, %11:vgpr_32, %bb.4
+ %8:sreg_64_xexec = SI_IF %5:sreg_64_xexec, %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+ S_BRANCH %bb.4
+
+ bb.3:
+ SI_END_CF %8:sreg_64_xexec, implicit-def $exec, implicit-def $scc, implicit $exec
+ %13:sreg_32 = S_MOV_B32 1
+ %15:vgpr_32 = COPY %13:sreg_32
+ %10:vgpr_32, dead %20:sreg_64_xexec = V_ADD_CO_U32_e64 %6:vgpr_32, %15:vgpr_32, 0, implicit $exec
+
+ bb.4:
+ %11:vgpr_32 = PHI %10:vgpr_32, %bb.3, %6:vgpr_32, %bb.2
+ %16:sreg_32 = S_MOV_B32 4294967295
+ %17:sreg_32 = S_MOV_B32 61440
+ %18:sreg_64 = REG_SEQUENCE %16:sreg_32, %subreg.sub0, %17:sreg_32, %subreg.sub1
+ %19:sgpr_128 = REG_SEQUENCE %12:sreg_64, %subreg.sub0_sub1, %18:sreg_64, %subreg.sub2_sub3
+ BUFFER_STORE_DWORD_OFFSET %11:vgpr_32, %19:sgpr_128, 0, 0, 0, 0, implicit $exec
+ S_BRANCH %bb.2
+
+...