PS: Submitting on behalf of Jay.
Reviewed By: rampitec
Differential Revision: https://reviews.llvm.org/D100008
defm : DSReadPat_mc <DS_READ_B32, i32, "atomic_load_32_local">;
defm : DSReadPat_mc <DS_READ_B64, i64, "atomic_load_64_local">;
+// Prefer ds_read over ds_read2, all other things being equal, because it has
+// a larger immediate offset range.
let AddedComplexity = 100 in {
foreach vt = VReg_64.RegTypes in {
defm : DSReadPat_mc <DS_READ_B96, vt, "load_align16_local">;
}
+// For performance reasons restrict this to alignment >= 16 even with
+// unaligned-access-mode. At lower alignments ds_read2_b64 is always a better
+// choice.
foreach vt = VReg_128.RegTypes in {
defm : DSReadPat_mc <DS_READ_B128, vt, "load_align16_local">;
}
defm : DSReadPat_mc <DS_READ_B96, vt, "load_local">;
}
-foreach vt = VReg_128.RegTypes in {
-defm : DSReadPat_mc <DS_READ_B128, vt, "load_local">;
-}
-
} // End SubtargetPredicate = HasUnalignedAccessMode
} // End SubtargetPredicate = isGFX7Plus
defm : DS128Bit8ByteAlignedPat_mc<vt>;
}
+// Prefer ds_write over ds_write2, all other things being equal, because it has
+// a larger immediate offset range.
let AddedComplexity = 100 in {
foreach vt = VReg_64.RegTypes in {
defm : DSWritePat_mc <DS_WRITE_B96, vt, "store_align16_local">;
}
+// For performance reasons restrict this to alignment >= 16 even with
+// unaligned-access-mode. At lower alignments ds_write2_b64 is always a better
+// choice.
foreach vt = VReg_128.RegTypes in {
defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align16_local">;
}
defm : DSWritePat_mc <DS_WRITE_B96, vt, "store_local">;
}
-foreach vt = VReg_128.RegTypes in {
-defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_local">;
-}
-
} // End SubtargetPredicate = HasUnalignedAccessMode
} // End SubtargetPredicate = isGFX7Plus
; ALIGNED-DAG: ds_read2_b32
; ALIGNED-DAG: ds_write2_b32
; ALIGNED-DAG: ds_write2_b32
-; UNALIGNED-DAG: ds_read_b128
-; UNALIGNED-DAG: ds_write_b128
+; UNALIGNED-DAG: ds_read2_b64
+; UNALIGNED-DAG: ds_write2_b64
define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) {
bb:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
; GCN-LABEL: test_local_v4_aligned8:
; ALIGNED-DAG: ds_read2_b64
; ALIGNED-DAG: ds_write2_b64
-; UNALIGNED-DAG: ds_read_b128
-; UNALIGNED-DAG: ds_write_b128
+; UNALIGNED-DAG: ds_read2_b64
+; UNALIGNED-DAG: ds_write2_b64
define amdgpu_kernel void @test_local_v4_aligned8(i32 addrspace(3)* %arg) {
bb:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
; GFX9-LABEL: load_lds_v4i32_align1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ds_read_b128 v[0:3], v0
+; GFX9-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: store_lds_v4i32_align1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ds_write_b128 v0, v[1:4]
+; GFX9-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
-; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0
+; UNALIGNED-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: ds_write_b128 v4, v[0:3]
+; UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; UNALIGNED-NEXT: s_endpgm
%val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 1
store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 1
; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
-; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0
+; UNALIGNED-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: ds_write_b128 v4, v[0:3]
+; UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; UNALIGNED-NEXT: s_endpgm
%val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 2
store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 2
; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
-; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0
+; UNALIGNED-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: ds_write_b128 v4, v[0:3]
+; UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; UNALIGNED-NEXT: s_endpgm
%val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 4
store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 4
}
define amdgpu_kernel void @ds16align8(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
-; ALIGNED-LABEL: ds16align8:
-; ALIGNED: ; %bb.0:
-; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-NEXT: v_mov_b32_e32 v0, s0
-; ALIGNED-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
-; ALIGNED-NEXT: v_mov_b32_e32 v4, s1
-; ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
-; ALIGNED-NEXT: s_endpgm
-;
-; UNALIGNED-LABEL: ds16align8:
-; UNALIGNED: ; %bb.0:
-; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
-; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0
-; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1
-; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: ds_write_b128 v4, v[0:3]
-; UNALIGNED-NEXT: s_endpgm
+; GCN-LABEL: ds16align8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
+; GCN-NEXT: v_mov_b32_e32 v4, s1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; GCN-NEXT: s_endpgm
%val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 8
store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 8
ret void
; GFX9-UNALIGNED-LABEL: load_misaligned64_constant_offsets:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-UNALIGNED-NEXT: ds_read_b128 v[0:3], v4
+; GFX9-UNALIGNED-NEXT: ds_read2_b64 v[0:3], v4 offset1:1
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
;
; GFX9-UNALIGNED-LABEL: store_misaligned64_constant_offsets:
; GFX9-UNALIGNED: ; %bb.0:
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-UNALIGNED-NEXT: ds_write_b128 v1, v[0:3]
+; GFX9-UNALIGNED-NEXT: s_movk_i32 s0, 0x7b
+; GFX9-UNALIGNED-NEXT: s_mov_b32 s1, 0
+; GFX9-UNALIGNED-NEXT: s_mov_b32 s2, s0
+; GFX9-UNALIGNED-NEXT: s_mov_b32 s3, s1
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GFX9-UNALIGNED-NEXT: s_endpgm
store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-UNALIGNED-NEXT: ds_write_b128 v4, v[0:3]
+; GFX9-UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GFX9-UNALIGNED-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in
; ALIGNED-DAG: ds_read2_b32
; ALIGNED-DAG: ds_write2_b32
; ALIGNED-DAG: ds_write2_b32
-; UNALIGNED-DAG: ds_read_b128
-; UNALIGNED-DAG: ds_write_b128
+; UNALIGNED-DAG: ds_read2_b64
+; UNALIGNED-DAG: ds_write2_b64
define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) {
bb:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
; GCN-LABEL: test_local_v4_aligned8:
; ALIGNED-DAG: ds_read2_b64
; ALIGNED-DAG: ds_write2_b64
-; UNALIGNED-DAG: ds_read_b128
-; UNALIGNED-DAG: ds_write_b128
+; UNALIGNED-DAG: ds_read2_b64
+; UNALIGNED-DAG: ds_write2_b64
define amdgpu_kernel void @test_local_v4_aligned8(i32 addrspace(3)* %arg) {
bb:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()