def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_barrier_local : Intrinsic<[], [], []>;
- def int_AMDGPU_barrier_global : Intrinsic<[], [], []>;
+ def int_AMDGPU_barrier_local : Intrinsic<[], [], [IntrConvergent]>;
+ def int_AMDGPU_barrier_global : Intrinsic<[], [], [IntrConvergent]>;
}
// Legacy names for compatibility.
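For reference, a minimal IR sketch (not taken from this patch) of what the IntrConvergent change means for the tests below: the barrier declarations become convergent instead of noduplicate, so a pass may duplicate the call (for example when unrolling) but may not make it control-dependent on any additional values.

; Illustrative sketch only; @convergent_example is not part of the patch.
declare void @llvm.AMDGPU.barrier.local() #0

define void @convergent_example(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) #1 {
entry:
  store i32 1, i32 addrspace(3)* %lds, align 4
  ; convergent: the call may be duplicated, but no new control dependences
  ; may be introduced around it.
  call void @llvm.AMDGPU.barrier.local()
  %v = load i32, i32 addrspace(3)* %lds, align 4
  store i32 %v, i32 addrspace(1)* %out, align 4
  ret void
}

attributes #0 = { convergent nounwind }
attributes #1 = { nounwind }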
declare i32 @llvm.r600.read.tidig.x() #3
attributes #0 = { nounwind }
-attributes #1 = { nounwind noduplicate }
+attributes #1 = { nounwind convergent }
attributes #3 = { nounwind readnone }
; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=+promote-alloca < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
declare i32 @llvm.SI.tid() nounwind readnone
-declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
+declare void @llvm.AMDGPU.barrier.local() nounwind convergent
; The required pointer calculations for the alloca'd values actually require
; an add and won't be folded into the addressing, which fails with a
; 64-bit pointer add. This should work since private pointers should
; be 32-bits.
%alloca_ptr = getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b
store i32 %result, i32* %alloca_ptr, align 4
; Dummy call
- call void @llvm.AMDGPU.barrier.local() nounwind noduplicate
+ call void @llvm.AMDGPU.barrier.local() nounwind convergent
%reload = load i32, i32* %alloca_ptr, align 4
%out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
store i32 %reload, i32 addrspace(1)* %out_ptr, align 4
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
-attributes #2 = { noduplicate nounwind }
+attributes #2 = { convergent nounwind }
}
attributes #0 = { nounwind readnone }
-attributes #1 = { noduplicate nounwind }
+attributes #1 = { convergent nounwind }
attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
-attributes #2 = { nounwind noduplicate convergent }
+attributes #2 = { nounwind convergent }
; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.tidig.y() #1
-; Function Attrs: noduplicate nounwind
+; Function Attrs: convergent nounwind
declare void @llvm.AMDGPU.barrier.local() #2
attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
-attributes #2 = { noduplicate nounwind }
+attributes #2 = { convergent nounwind }
; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.tidig.y() #1
-; Function Attrs: noduplicate nounwind
+; Function Attrs: convergent nounwind
declare void @llvm.AMDGPU.barrier.local() #2
attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
-attributes #2 = { noduplicate nounwind }
+attributes #2 = { convergent nounwind }
; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.tidig.y() #1
-; Function Attrs: noduplicate nounwind
+; Function Attrs: convergent nounwind
declare void @llvm.AMDGPU.barrier.local() #2
attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
-attributes #2 = { noduplicate nounwind }
+attributes #2 = { convergent nounwind }
; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.tidig.y() #1
-; Function Attrs: noduplicate nounwind
+; Function Attrs: convergent nounwind
declare void @llvm.AMDGPU.barrier.local() #2
attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
-attributes #2 = { noduplicate nounwind }
+attributes #2 = { convergent nounwind }
; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.tidig.y() #1
-; Function Attrs: noduplicate nounwind
+; Function Attrs: convergent nounwind
declare void @llvm.AMDGPU.barrier.local() #2
attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
-attributes #2 = { noduplicate nounwind }
+attributes #2 = { convergent nounwind }
declare i32 @llvm.r600.read.tidig.x() #3
attributes #0 = { nounwind }
-attributes #1 = { nounwind noduplicate }
+attributes #1 = { nounwind convergent }
attributes #3 = { nounwind readnone }
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
-declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+declare void @llvm.AMDGPU.barrier.local() convergent nounwind
; SI-LABEL: {{^}}private_access_f64_alloca:
%array = alloca double, i32 16, align 8
%ptr = getelementptr double, double* %array, i32 %b
store double %val, double* %ptr, align 8
- call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+ call void @llvm.AMDGPU.barrier.local() convergent nounwind
%result = load double, double* %ptr, align 8
store double %result, double addrspace(1)* %out, align 8
ret void
%array = alloca <2 x double>, i32 16, align 16
%ptr = getelementptr <2 x double>, <2 x double>* %array, i32 %b
store <2 x double> %val, <2 x double>* %ptr, align 16
- call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+ call void @llvm.AMDGPU.barrier.local() convergent nounwind
%result = load <2 x double>, <2 x double>* %ptr, align 16
store <2 x double> %result, <2 x double> addrspace(1)* %out, align 16
ret void
%array = alloca i64, i32 16, align 8
%ptr = getelementptr i64, i64* %array, i32 %b
store i64 %val, i64* %ptr, align 8
- call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+ call void @llvm.AMDGPU.barrier.local() convergent nounwind
%result = load i64, i64* %ptr, align 8
store i64 %result, i64 addrspace(1)* %out, align 8
ret void
%array = alloca <2 x i64>, i32 16, align 16
%ptr = getelementptr <2 x i64>, <2 x i64>* %array, i32 %b
store <2 x i64> %val, <2 x i64>* %ptr, align 16
- call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+ call void @llvm.AMDGPU.barrier.local() convergent nounwind
%result = load <2 x i64>, <2 x i64>* %ptr, align 16
store <2 x i64> %result, <2 x i64> addrspace(1)* %out, align 16
ret void
; FIXME: Enable for VI.
declare i32 @llvm.r600.read.tidig.x() nounwind readnone
-declare void @llvm.AMDGPU.barrier.global() nounwind noduplicate
+declare void @llvm.AMDGPU.barrier.global() nounwind convergent
declare float @llvm.AMDGPU.div.fmas.f32(float, float, float, i1) nounwind readnone
declare double @llvm.AMDGPU.div.fmas.f64(double, double, double, i1) nounwind readnone
declare void @llvm.AMDGPU.barrier.local() #1
attributes #0 = { nounwind }
-attributes #1 = { noduplicate nounwind }
+attributes #1 = { convergent nounwind }
; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI
; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI
-declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
+declare void @llvm.AMDGPU.barrier.local() nounwind convergent
; SI-LABEL: {{^}}main(
attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
attributes #1 = { "ShaderType"="1" nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #2 = { nounwind noduplicate }
+attributes #2 = { nounwind convergent }
ret void
}
-; Function Attrs: noduplicate nounwind
+; Function Attrs: convergent nounwind
declare void @llvm.AMDGPU.barrier.local() #2
-attributes #2 = { noduplicate nounwind }
+attributes #2 = { convergent nounwind }
}
-; Function Attrs: noduplicate nounwind
+; Function Attrs: convergent nounwind
declare void @llvm.AMDGPU.barrier.global() #1
; Function Attrs: nounwind readnone
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
attributes #0 = { "ShaderType"="1" }
-attributes #1 = { noduplicate nounwind }
+attributes #1 = { convergent nounwind }
attributes #2 = { nounwind readnone }
!0 = !{!1, !1, i64 0, i32 1}
--- /dev/null
+if not 'AMDGPU' in config.root.targets:
+ config.unsupported = True
+
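# Note: the lit.local.cfg added above marks this directory's tests as
# unsupported when the AMDGPU target is not built, so the unroll test
# below only runs in builds that include the AMDGPU backend.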
--- /dev/null
+; RUN: opt -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii -loop-unroll -S < %s | FileCheck %s
+
+; CHECK-LABEL: @test_unroll_convergent_barrier(
+; CHECK: call void @llvm.AMDGPU.barrier.global()
+; CHECK: call void @llvm.AMDGPU.barrier.global()
+; CHECK: call void @llvm.AMDGPU.barrier.global()
+; CHECK: call void @llvm.AMDGPU.barrier.global()
+; CHECK-NOT: br
+define void @test_unroll_convergent_barrier(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(1)* noalias nocapture %in) #0 {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %sum.02 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+ %arrayidx.in = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %indvars.iv
+ %arrayidx.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %indvars.iv
+ %load = load i32, i32 addrspace(1)* %arrayidx.in
+ call void @llvm.AMDGPU.barrier.global() #1
+ %add = add i32 %load, %sum.02
+ store i32 %add, i32 addrspace(1)* %arrayidx.out
+ %indvars.iv.next = add i32 %indvars.iv, 1
+ %exitcond = icmp eq i32 %indvars.iv.next, 4
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare void @llvm.AMDGPU.barrier.global() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind convergent }
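; Why the new test is expected to unroll: noduplicate forbids duplicating the
; barrier call, which blocked full unrolling of loops containing it. convergent
; only forbids adding new control dependences to the call; duplicating it across
; the four known iterations is legal, so the CHECK lines expect four barrier
; calls and no remaining branch.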