[ImmArg<0>, IntrNoMem, IntrHasSideEffects]>;
def int_amdgcn_s_barrier : GCCBuiltin<"__builtin_amdgcn_s_barrier">,
- Intrinsic<[], [], [IntrConvergent]>;
+ Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent]>;
def int_amdgcn_wave_barrier : GCCBuiltin<"__builtin_amdgcn_wave_barrier">,
Intrinsic<[], [], [IntrConvergent]>;
[(int_amdgcn_s_barrier)]> {
let SchedRW = [WriteBarrier];
let simm16 = 0;
- let mayLoad = 1;
- let mayStore = 1;
let isConvergent = 1;
}
; VARIANT0-NEXT: v_mov_b32_e32 v2, 0
; VARIANT0-NEXT: s_waitcnt lgkmcnt(0)
; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
-; VARIANT0-NEXT: v_add_i32_e32 v3, vcc, s2, v3
; VARIANT0-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; VARIANT0-NEXT: s_barrier
+; VARIANT0-NEXT: v_add_i32_e32 v3, vcc, s2, v3
; VARIANT0-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; VARIANT0-NEXT: v_lshl_b64 v[3:4], v[3:4], 2
; VARIANT0-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64
; VARIANT1-NEXT: v_mov_b32_e32 v2, 0
; VARIANT1-NEXT: s_waitcnt lgkmcnt(0)
; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
-; VARIANT1-NEXT: v_add_i32_e32 v3, vcc, s2, v3
; VARIANT1-NEXT: s_barrier
+; VARIANT1-NEXT: v_add_i32_e32 v3, vcc, s2, v3
; VARIANT1-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; VARIANT1-NEXT: v_lshl_b64 v[3:4], v[3:4], 2
; VARIANT1-NEXT: s_waitcnt expcnt(0)
; VARIANT2-LABEL: test_barrier:
; VARIANT2: ; %bb.0: ; %entry
; VARIANT2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VARIANT2-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VARIANT2-NEXT: s_load_dword s0, s[0:1], 0x2c
+; VARIANT2-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; VARIANT2-NEXT: s_waitcnt lgkmcnt(0)
-; VARIANT2-NEXT: v_mov_b32_e32 v2, s3
-; VARIANT2-NEXT: v_add_co_u32_e32 v1, vcc, s2, v1
-; VARIANT2-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
-; VARIANT2-NEXT: global_store_dword v[1:2], v0, off
+; VARIANT2-NEXT: v_mov_b32_e32 v4, s3
+; VARIANT2-NEXT: v_xad_u32 v1, v0, -1, s0
+; VARIANT2-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; VARIANT2-NEXT: v_add_co_u32_e32 v3, vcc, s2, v3
+; VARIANT2-NEXT: v_lshlrev_b64 v[1:2], 2, v[1:2]
+; VARIANT2-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; VARIANT2-NEXT: global_store_dword v[3:4], v0, off
+; VARIANT2-NEXT: v_mov_b32_e32 v5, s3
+; VARIANT2-NEXT: v_add_co_u32_e32 v0, vcc, s2, v1
+; VARIANT2-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v2, vcc
; VARIANT2-NEXT: s_waitcnt vmcnt(0)
; VARIANT2-NEXT: s_barrier
-; VARIANT2-NEXT: v_xad_u32 v3, v0, -1, s0
-; VARIANT2-NEXT: v_ashrrev_i32_e32 v4, 31, v3
-; VARIANT2-NEXT: v_lshlrev_b64 v[3:4], 2, v[3:4]
-; VARIANT2-NEXT: v_mov_b32_e32 v0, s3
-; VARIANT2-NEXT: v_add_co_u32_e32 v3, vcc, s2, v3
-; VARIANT2-NEXT: v_addc_co_u32_e32 v4, vcc, v0, v4, vcc
-; VARIANT2-NEXT: global_load_dword v0, v[3:4], off
+; VARIANT2-NEXT: global_load_dword v0, v[0:1], off
; VARIANT2-NEXT: s_waitcnt vmcnt(0)
-; VARIANT2-NEXT: global_store_dword v[1:2], v0, off
+; VARIANT2-NEXT: global_store_dword v[3:4], v0, off
; VARIANT2-NEXT: s_endpgm
;
; VARIANT3-LABEL: test_barrier:
; VARIANT3: ; %bb.0: ; %entry
; VARIANT3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VARIANT3-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VARIANT3-NEXT: s_load_dword s0, s[0:1], 0x2c
+; VARIANT3-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; VARIANT3-NEXT: s_waitcnt lgkmcnt(0)
-; VARIANT3-NEXT: v_mov_b32_e32 v2, s3
-; VARIANT3-NEXT: v_add_co_u32_e32 v1, vcc, s2, v1
-; VARIANT3-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
-; VARIANT3-NEXT: global_store_dword v[1:2], v0, off
-; VARIANT3-NEXT: s_barrier
-; VARIANT3-NEXT: v_xad_u32 v3, v0, -1, s0
-; VARIANT3-NEXT: v_ashrrev_i32_e32 v4, 31, v3
-; VARIANT3-NEXT: v_lshlrev_b64 v[3:4], 2, v[3:4]
-; VARIANT3-NEXT: v_mov_b32_e32 v0, s3
+; VARIANT3-NEXT: v_mov_b32_e32 v4, s3
+; VARIANT3-NEXT: v_xad_u32 v1, v0, -1, s0
+; VARIANT3-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; VARIANT3-NEXT: v_add_co_u32_e32 v3, vcc, s2, v3
-; VARIANT3-NEXT: v_addc_co_u32_e32 v4, vcc, v0, v4, vcc
-; VARIANT3-NEXT: global_load_dword v0, v[3:4], off
+; VARIANT3-NEXT: v_lshlrev_b64 v[1:2], 2, v[1:2]
+; VARIANT3-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; VARIANT3-NEXT: global_store_dword v[3:4], v0, off
+; VARIANT3-NEXT: v_mov_b32_e32 v5, s3
+; VARIANT3-NEXT: v_add_co_u32_e32 v0, vcc, s2, v1
+; VARIANT3-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v2, vcc
+; VARIANT3-NEXT: s_barrier
+; VARIANT3-NEXT: global_load_dword v0, v[0:1], off
; VARIANT3-NEXT: s_waitcnt vmcnt(0)
-; VARIANT3-NEXT: global_store_dword v[1:2], v0, off
+; VARIANT3-NEXT: global_store_dword v[3:4], v0, off
; VARIANT3-NEXT: s_endpgm
entry:
%tmp = call i32 @llvm.amdgcn.workitem.id.x()
; GCN-LABEL: {{^}}local_memory_two_objects:
; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
+; CI-DAG: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
; SI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
-
-; GCN: s_barrier
-
; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]]
; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]
+; GCN: s_barrier
+
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]
-
-; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
--- /dev/null
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=machine-scheduler -o - %s | FileCheck %s
+
+---
+# Check that both high-latency buffer loads are scheduled first, ahead of the
+# multiplies that consume them, despite the S_BARRIER earlier in the function.
+# CHECK: BUFFER_LOAD_DWORD_OFFSET
+# CHECK: BUFFER_LOAD_DWORD_OFFSET
+# CHECK: V_MUL_LO_U32
+# CHECK: V_MUL_LO_U32
+name: test
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9
+
+ undef %43.sub3:vreg_128 = COPY $vgpr9
+ undef %42.sub2:vreg_128 = COPY $vgpr8
+ undef %41.sub1:vreg_128 = COPY $vgpr7
+ undef %26.sub0:vreg_128 = COPY $vgpr6
+ undef %46.sub3:vreg_128 = COPY $vgpr5
+ undef %45.sub2:vreg_128 = COPY $vgpr4
+ undef %44.sub1:vreg_128 = COPY $vgpr3
+ undef %32.sub0:vreg_128 = COPY $vgpr2
+ undef %38.sub1:vreg_64 = COPY $vgpr1
+ %38.sub0:vreg_64 = COPY $vgpr0
+
+ S_BARRIER
+
+ undef %33.sub0:sgpr_128 = V_READFIRSTLANE_B32 %32.sub0, implicit $exec
+ %33.sub1:sgpr_128 = V_READFIRSTLANE_B32 %44.sub1, implicit $exec
+ %33.sub2:sgpr_128 = V_READFIRSTLANE_B32 %45.sub2, implicit $exec
+ %33.sub3:sgpr_128 = V_READFIRSTLANE_B32 %46.sub3, implicit $exec
+ %15:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %33, 0, 0, 0, 0, 0, 0, implicit $exec
+ %39:vgpr_32 = V_MUL_LO_U32 %15, %15, implicit $exec
+
+ undef %27.sub0:sgpr_128 = V_READFIRSTLANE_B32 %26.sub0, implicit $exec
+ %27.sub1:sgpr_128 = V_READFIRSTLANE_B32 %41.sub1, implicit $exec
+ %27.sub2:sgpr_128 = V_READFIRSTLANE_B32 %42.sub2, implicit $exec
+ %27.sub3:sgpr_128 = V_READFIRSTLANE_B32 %43.sub3, implicit $exec
+ %19:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %27, 0, 0, 0, 0, 0, 0, implicit $exec
+ %40:vgpr_32 = V_MUL_LO_U32 %19, %19, implicit $exec
+
+ %23:vgpr_32 = V_ADD_U32_e32 %39, %40, implicit $exec
+ GLOBAL_STORE_DWORD %38, %23, 0, 0, 0, 0, implicit $exec
+ S_ENDPGM 0
+...
; GCN: {{buffer|flat}}_load_dword v[[C_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
+; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+
+; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+
+; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
+
; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]]
; GCN-LABEL: barrier_vmcnt_global:
; GFX8: flat_load_dword
; GFX9_10: global_load_dword
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX9_10-NEXT: s_waitcnt vmcnt(0){{$}}
+; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX9_10: s_waitcnt vmcnt(0){{$}}
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_global(i32 addrspace(1)* %arg) {
bb:
; GCN-LABEL: barrier_vscnt_global:
; GFX8: flat_store_dword
; GFX9_10: global_store_dword
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX9-NEXT: s_waitcnt vmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX9: s_waitcnt vmcnt(0){{$}}
+; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vscnt_global(i32 addrspace(1)* %arg) {
bb:
; GCN-LABEL: barrier_vmcnt_vscnt_global:
; GFX8: flat_load_dword
; GFX9_10: global_load_dword
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX9_10-NEXT: s_waitcnt vmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX9_10: s_waitcnt vmcnt(0){{$}}
+; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_vscnt_global(i32 addrspace(1)* %arg) {
bb:
; GCN-LABEL: barrier_vmcnt_flat:
; GCN: flat_load_dword
-; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_flat(i32* %arg) {
bb:
; GCN-LABEL: barrier_vscnt_flat:
; GCN: flat_store_dword
-; GFX8_9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX8_9: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10: s_waitcnt lgkmcnt(0){{$}}
+; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vscnt_flat(i32* %arg) {
bb:
; GCN-LABEL: barrier_vmcnt_vscnt_flat:
; GCN: flat_load_dword
-; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_vscnt_flat(i32* %arg) {
bb:
; GCN-LABEL: barrier_vmcnt_vscnt_flat_workgroup:
; GCN: flat_load_dword
-; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(i32* %arg) {
bb:
; GCN-LABEL: store_vscnt_private:
; GCN: buffer_store_dword
-; GFX8_9-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX8_9: s_waitcnt vmcnt(0)
+; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
define void @store_vscnt_private(i32 addrspace(5)* %p) {
store i32 0, i32 addrspace(5)* %p
; GCN-LABEL: store_vscnt_global:
; GFX8: flat_store_dword
; GFX9_10: global_store_dword
-; GFX8_9-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX8_9: s_waitcnt vmcnt(0)
+; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
define void @store_vscnt_global(i32 addrspace(1)* %p) {
store i32 0, i32 addrspace(1)* %p
; GCN-LABEL: store_vscnt_flat:
; GCN: flat_store_dword
-; GFX8_9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX8_9: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10: s_waitcnt lgkmcnt(0){{$}}
+; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
define void @store_vscnt_flat(i32* %p) {
store i32 0, i32* %p
}
; GCN-LABEL: function_prologue:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0){{$}}
+; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
define void @function_prologue() {
ret void