--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -march=amdgcn -mcpu=gfx900 -amdgpu-aa -amdgpu-aa-wrapper -amdgpu-annotate-uniform -S < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Check that a barrier or fence in between loads is not considered a clobber
+; for the purpose of converting vector loads into scalar.
+
+@LDS = linkonce_odr hidden local_unnamed_addr addrspace(3) global i32 undef
+
+; Straight-line case: fences plus s.barrier/wave.barrier between two uniform
+; loads must not be treated as clobbers. Both loads keep !amdgpu.noclobber
+; and are selected as scalar s_load_dword rather than global_load_dword.
+; GCN-LABEL: {{^}}simple_barrier:
+; GCN: s_load_dword s
+; GCN: s_waitcnt lgkmcnt(0)
+; GCN: s_barrier
+; GCN: s_waitcnt lgkmcnt(0)
+; GCN: ; wave barrier
+; GCN-NOT: global_load_dword
+; GCN: s_load_dword s
+; GCN-NOT: global_load_dword
+; GCN: global_store_dword
+define amdgpu_kernel void @simple_barrier(i32 addrspace(1)* %arg) {
+; CHECK-LABEL: @simple_barrier(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4
+; CHECK-NEXT: fence syncscope("workgroup") release
+; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: fence syncscope("workgroup") acquire
+; CHECK-NEXT: tail call void @llvm.amdgcn.wave.barrier()
+; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 1, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT: [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4
+; CHECK-NEXT: [[I3:%.*]] = add i32 [[I2]], [[I]]
+; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 2
+; CHECK-NEXT: store i32 [[I3]], i32 addrspace(1)* [[I4]], align 4
+; CHECK-NEXT: ret void
+;
+bb:
+ %i = load i32, i32 addrspace(1)* %arg, align 4
+ fence syncscope("workgroup") release
+ tail call void @llvm.amdgcn.s.barrier()
+ fence syncscope("workgroup") acquire
+ tail call void @llvm.amdgcn.wave.barrier()
+ %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+ %i2 = load i32, i32 addrspace(1)* %i1, align 4
+ %i3 = add i32 %i2, %i
+ %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+ store i32 %i3, i32 addrspace(1)* %i4, align 4
+ ret void
+}
+
+; Diamond control flow where neither predecessor of if.end writes memory
+; (one arm has a barrier, the other a fence): the load after the merge point
+; keeps !amdgpu.noclobber and is still scalarized.
+; GCN-LABEL: {{^}}memory_phi_no_clobber:
+; GCN: s_load_dword s
+; GCN: s_waitcnt lgkmcnt(0)
+; GCN: s_waitcnt lgkmcnt(0)
+; GCN: s_barrier
+; GCN-NOT: global_load_dword
+; GCN: s_load_dword s
+; GCN-NOT: global_load_dword
+; GCN: global_store_dword
+define amdgpu_kernel void @memory_phi_no_clobber(i32 addrspace(1)* %arg) {
+; CHECK-LABEL: @memory_phi_no_clobber(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4
+; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0
+; CHECK: if.then:
+; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br label [[IF_END:%.*]], !amdgpu.uniform !0
+; CHECK: if.else:
+; CHECK-NEXT: fence syncscope("workgroup") release
+; CHECK-NEXT: br label [[IF_END]], !amdgpu.uniform !0
+; CHECK: if.end:
+; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 1, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT: [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4
+; CHECK-NEXT: [[I3:%.*]] = add i32 [[I2]], [[I]]
+; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 2
+; CHECK-NEXT: store i32 [[I3]], i32 addrspace(1)* [[I4]], align 4
+; CHECK-NEXT: ret void
+;
+bb:
+ %i = load i32, i32 addrspace(1)* %arg, align 4
+ br i1 undef, label %if.then, label %if.else
+
+if.then:
+ tail call void @llvm.amdgcn.s.barrier()
+ br label %if.end
+
+if.else:
+ fence syncscope("workgroup") release
+ br label %if.end
+
+if.end:
+ %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+ %i2 = load i32, i32 addrspace(1)* %i1, align 4
+ %i3 = add i32 %i2, %i
+ %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+ store i32 %i3, i32 addrspace(1)* %i4, align 4
+ ret void
+}
+
+; Diamond where the if.then arm stores to %arg-based memory: that path is a
+; real clobber, so the load after the merge keeps only !amdgpu.uniform (no
+; !amdgpu.noclobber) and must stay a vector global_load_dword.
+; GCN-LABEL: {{^}}memory_phi_clobber1:
+; GCN: s_load_dword s
+; GCN: s_barrier
+; GCN: global_store_dword
+; GCN: global_load_dword
+; GCN: global_store_dword
+define amdgpu_kernel void @memory_phi_clobber1(i32 addrspace(1)* %arg) {
+; CHECK-LABEL: @memory_phi_clobber1(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4
+; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0
+; CHECK: if.then:
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 3
+; CHECK-NEXT: store i32 1, i32 addrspace(1)* [[GEP]], align 4
+; CHECK-NEXT: br label [[IF_END:%.*]], !amdgpu.uniform !0
+; CHECK: if.else:
+; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br label [[IF_END]], !amdgpu.uniform !0
+; CHECK: if.end:
+; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 1, !amdgpu.uniform !0
+; CHECK-NEXT: [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4
+; CHECK-NEXT: [[I3:%.*]] = add i32 [[I2]], [[I]]
+; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 2
+; CHECK-NEXT: store i32 [[I3]], i32 addrspace(1)* [[I4]], align 4
+; CHECK-NEXT: ret void
+;
+bb:
+ %i = load i32, i32 addrspace(1)* %arg, align 4
+ br i1 undef, label %if.then, label %if.else
+
+if.then:
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
+ store i32 1, i32 addrspace(1)* %gep, align 4
+ br label %if.end
+
+if.else:
+ tail call void @llvm.amdgcn.s.barrier()
+ br label %if.end
+
+if.end:
+ %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+ %i2 = load i32, i32 addrspace(1)* %i1, align 4
+ %i3 = add i32 %i2, %i
+ %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+ store i32 %i3, i32 addrspace(1)* %i4, align 4
+ ret void
+}
+
+; Mirror of memory_phi_clobber1 with the store moved to the if.else arm:
+; the clobber must be detected regardless of which predecessor writes, so
+; the merged load again lacks !amdgpu.noclobber and stays a global_load.
+; GCN-LABEL: {{^}}memory_phi_clobber2:
+; GCN-DAG: s_load_dword s
+; GCN-DAG: global_store_dword
+; GCN: s_barrier
+; GCN: global_load_dword
+; GCN: global_store_dword
+define amdgpu_kernel void @memory_phi_clobber2(i32 addrspace(1)* %arg) {
+; CHECK-LABEL: @memory_phi_clobber2(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4
+; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0
+; CHECK: if.then:
+; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br label [[IF_END:%.*]], !amdgpu.uniform !0
+; CHECK: if.else:
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 3
+; CHECK-NEXT: store i32 1, i32 addrspace(1)* [[GEP]], align 4
+; CHECK-NEXT: br label [[IF_END]], !amdgpu.uniform !0
+; CHECK: if.end:
+; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 1, !amdgpu.uniform !0
+; CHECK-NEXT: [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4
+; CHECK-NEXT: [[I3:%.*]] = add i32 [[I2]], [[I]]
+; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 2
+; CHECK-NEXT: store i32 [[I3]], i32 addrspace(1)* [[I4]], align 4
+; CHECK-NEXT: ret void
+;
+bb:
+ %i = load i32, i32 addrspace(1)* %arg, align 4
+ br i1 undef, label %if.then, label %if.else
+
+if.then:
+ tail call void @llvm.amdgcn.s.barrier()
+ br label %if.end
+
+if.else:
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
+ store i32 1, i32 addrspace(1)* %gep, align 4
+ br label %if.end
+
+if.end:
+ %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+ %i2 = load i32, i32 addrspace(1)* %i1, align 4
+ %i3 = add i32 %i2, %i
+ %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+ store i32 %i3, i32 addrspace(1)* %i4, align 4
+ ret void
+}
+
+; Loop body containing a wave.barrier and a store to a distinct fixed offset
+; of the same pointer: neither is treated as a clobber of the loaded address,
+; so the in-loop load keeps !amdgpu.noclobber and is scalarized.
+; GCN-LABEL: {{^}}no_clobbering_loop1:
+; GCN: s_load_dword s
+; GCN: s_load_dword s
+; GCN-NOT: global_load_dword
+; GCN: global_store_dword
+define amdgpu_kernel void @no_clobbering_loop1(i32 addrspace(1)* %arg, i1 %cc) {
+; CHECK-LABEL: @no_clobbering_loop1(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4
+; CHECK-NEXT: br label [[WHILE_COND:%.*]], !amdgpu.uniform !0
+; CHECK: while.cond:
+; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 1, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT: [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4
+; CHECK-NEXT: [[I3:%.*]] = add i32 [[I2]], [[I]]
+; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 2
+; CHECK-NEXT: store i32 [[I3]], i32 addrspace(1)* [[I4]], align 4
+; CHECK-NEXT: tail call void @llvm.amdgcn.wave.barrier()
+; CHECK-NEXT: br i1 [[CC:%.*]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform !0
+; CHECK: end:
+; CHECK-NEXT: ret void
+;
+bb:
+ %i = load i32, i32 addrspace(1)* %arg, align 4
+ br label %while.cond
+
+while.cond:
+ %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+ %i2 = load i32, i32 addrspace(1)* %i1, align 4
+ %i3 = add i32 %i2, %i
+ %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+ store i32 %i3, i32 addrspace(1)* %i4, align 4
+ tail call void @llvm.amdgcn.wave.barrier()
+ br i1 %cc, label %while.cond, label %end
+
+end:
+ ret void
+}
+
+; Reduction loop over noalias %arg with a loop-varying uniform index and a
+; wave.barrier per iteration; the only store goes to noalias %out after the
+; loop, so the in-loop load keeps !amdgpu.noclobber and is scalarized.
+; GCN-LABEL: {{^}}no_clobbering_loop2:
+; GCN: s_load_dword s
+; GCN: s_load_dword s
+; GCN-NOT: global_load_dword
+; GCN: global_store_dword
+define amdgpu_kernel void @no_clobbering_loop2(i32 addrspace(1)* noalias %arg, i32 addrspace(1)* noalias %out, i32 %n) {
+; CHECK-LABEL: @no_clobbering_loop2(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4
+; CHECK-NEXT: br label [[WHILE_COND:%.*]], !amdgpu.uniform !0
+; CHECK: while.cond:
+; CHECK-NEXT: [[C:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[INC:%.*]], [[WHILE_COND]] ]
+; CHECK-NEXT: [[ACC:%.*]] = phi i32 [ [[I]], [[BB]] ], [ [[I3:%.*]], [[WHILE_COND]] ]
+; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i32 [[C]], !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT: [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4
+; CHECK-NEXT: [[I3]] = add i32 [[I2]], [[ACC]]
+; CHECK-NEXT: tail call void @llvm.amdgcn.wave.barrier()
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[C]], 1
+; CHECK-NEXT: [[CC:%.*]] = icmp eq i32 [[INC]], [[N:%.*]]
+; CHECK-NEXT: br i1 [[CC]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform !0
+; CHECK: end:
+; CHECK-NEXT: store i32 [[I3]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT: ret void
+;
+bb:
+ %i = load i32, i32 addrspace(1)* %arg, align 4
+ br label %while.cond
+
+while.cond:
+ %c = phi i32 [ 0, %bb ], [ %inc, %while.cond ]
+ %acc = phi i32 [ %i, %bb ], [ %i3, %while.cond ]
+ %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %c
+ %i2 = load i32, i32 addrspace(1)* %i1, align 4
+ %i3 = add i32 %i2, %acc
+ tail call void @llvm.amdgcn.wave.barrier()
+ %inc = add nuw nsw i32 %c, 1
+ %cc = icmp eq i32 %inc, %n
+ br i1 %cc, label %while.cond, label %end
+
+end:
+ ret void
+}
+
+; Negative loop case: %arg and %out carry no noalias, so the in-loop store
+; through %out may alias the loaded address. The loop load loses
+; !amdgpu.noclobber and must be selected as global_load_dword.
+; GCN-LABEL: {{^}}clobbering_loop:
+; GCN: s_load_dword s
+; GCN: global_load_dword
+; GCN: global_store_dword
+define amdgpu_kernel void @clobbering_loop(i32 addrspace(1)* %arg, i32 addrspace(1)* %out, i1 %cc) {
+; CHECK-LABEL: @clobbering_loop(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4
+; CHECK-NEXT: br label [[WHILE_COND:%.*]], !amdgpu.uniform !0
+; CHECK: while.cond:
+; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 1, !amdgpu.uniform !0
+; CHECK-NEXT: [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4
+; CHECK-NEXT: [[I3:%.*]] = add i32 [[I2]], [[I]]
+; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[OUT:%.*]], i64 1
+; CHECK-NEXT: store i32 [[I3]], i32 addrspace(1)* [[I4]], align 4
+; CHECK-NEXT: tail call void @llvm.amdgcn.wave.barrier()
+; CHECK-NEXT: br i1 [[CC:%.*]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform !0
+; CHECK: end:
+; CHECK-NEXT: ret void
+;
+bb:
+ %i = load i32, i32 addrspace(1)* %arg, align 4
+ br label %while.cond
+
+while.cond:
+ %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+ %i2 = load i32, i32 addrspace(1)* %i1, align 4
+ %i3 = add i32 %i2, %i
+ %i4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
+ store i32 %i3, i32 addrspace(1)* %i4, align 4
+ tail call void @llvm.amdgcn.wave.barrier()
+ br i1 %cc, label %while.cond, label %end
+
+end:
+ ret void
+}
+
+; Unlike barriers/fences, a seq_cst atomic load IS treated as a potential
+; clobber: the atomic itself keeps !amdgpu.noclobber (it reads, not writes),
+; but the ordinary load after it loses the metadata and stays a glc'd /
+; vector global load sequence.
+; GCN-LABEL: {{^}}clobber_by_atomic_load:
+; GCN: s_load_dword s
+; GCN: global_load_dword {{.*}} glc
+; GCN: global_load_dword
+; GCN: global_store_dword
+define amdgpu_kernel void @clobber_by_atomic_load(i32 addrspace(1)* %arg) {
+; CHECK-LABEL: @clobber_by_atomic_load(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 2, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT: [[VAL:%.*]] = load atomic i32, i32 addrspace(1)* [[GEP]] seq_cst, align 4
+; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 3, !amdgpu.uniform !0
+; CHECK-NEXT: [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4
+; CHECK-NEXT: [[I3:%.*]] = add i32 [[I2]], [[I]]
+; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 4
+; CHECK-NEXT: store i32 [[I3]], i32 addrspace(1)* [[I4]], align 4
+; CHECK-NEXT: ret void
+;
+bb:
+ %i = load i32, i32 addrspace(1)* %arg, align 4
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+ %val = load atomic i32, i32 addrspace(1)* %gep seq_cst, align 4
+ %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
+ %i2 = load i32, i32 addrspace(1)* %i1, align 4
+ %i3 = add i32 %i2, %i
+ %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 4
+ store i32 %i3, i32 addrspace(1)* %i4, align 4
+ ret void
+}
+
+; A store to LDS (addrspace(3)) before the barrier cannot alias a global
+; (addrspace(1)) load, so the load after the barrier keeps !amdgpu.noclobber
+; and is selected as s_load_dword.
+; GCN-LABEL: {{^}}no_alias_store:
+; GCN: ds_write_b32
+; GCN: s_barrier
+; GCN: s_load_dword s
+; GCN-NOT: global_load_dword
+; GCN: global_store_dword
+define protected amdgpu_kernel void @no_alias_store(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+; CHECK-LABEL: @no_alias_store(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: store i32 0, i32 addrspace(3)* @LDS, align 4
+; CHECK-NEXT: fence syncscope("workgroup") release
+; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: fence syncscope("workgroup") acquire
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[GEP]], align 4
+; CHECK-NEXT: store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ store i32 0, i32 addrspace(3)* @LDS, align 4
+ fence syncscope("workgroup") release
+ tail call void @llvm.amdgcn.s.barrier()
+ fence syncscope("workgroup") acquire
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 0
+ %ld = load i32, i32 addrspace(1)* %gep, align 4
+ store i32 %ld, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; Counterpart of no_alias_store: the pre-barrier store goes through a global
+; pointer (%out) that may alias %in, so the load after the barrier has only
+; !amdgpu.uniform (no !amdgpu.noclobber) and stays a global_load_dword.
+; GCN-LABEL: {{^}}may_alias_store:
+; GCN: global_store_dword
+; GCN: s_barrier
+; GCN: global_load_dword
+; GCN: global_store_dword
+define protected amdgpu_kernel void @may_alias_store(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+; CHECK-LABEL: @may_alias_store(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: store i32 0, i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT: fence syncscope("workgroup") release
+; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: fence syncscope("workgroup") acquire
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0
+; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[GEP]], align 4
+; CHECK-NEXT: store i32 [[LD]], i32 addrspace(1)* [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ store i32 0, i32 addrspace(1)* %out, align 4
+ fence syncscope("workgroup") release
+ tail call void @llvm.amdgcn.s.barrier()
+ fence syncscope("workgroup") acquire
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 0
+ %ld = load i32, i32 addrspace(1)* %gep, align 4
+ store i32 %ld, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; Same as no_alias_store but with a volatile LDS store: volatility must not
+; pessimize the AA-based result — an LDS write still cannot alias the global
+; load, which keeps !amdgpu.noclobber and is scalarized.
+; GCN-LABEL: {{^}}no_alias_volatile_store:
+; GCN: ds_write_b32
+; GCN: s_barrier
+; GCN: s_load_dword s
+; GCN-NOT: global_load_dword
+; GCN: global_store_dword
+define protected amdgpu_kernel void @no_alias_volatile_store(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+; CHECK-LABEL: @no_alias_volatile_store(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: store volatile i32 0, i32 addrspace(3)* @LDS, align 4
+; CHECK-NEXT: fence syncscope("workgroup") release
+; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: fence syncscope("workgroup") acquire
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[GEP]], align 4
+; CHECK-NEXT: store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ store volatile i32 0, i32 addrspace(3)* @LDS, align 4
+ fence syncscope("workgroup") release
+ tail call void @llvm.amdgcn.s.barrier()
+ fence syncscope("workgroup") acquire
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 0
+ %ld = load i32, i32 addrspace(1)* %gep, align 4
+ store i32 %ld, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+declare void @llvm.amdgcn.s.barrier()
+declare void @llvm.amdgcn.wave.barrier()