--- /dev/null
+; RUN: opt -mtriple=amdgcn-- -passes='loop(simple-loop-unswitch<nontrivial>),verify<loops>' -S < %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-- -passes='loop-mssa(simple-loop-unswitch<nontrivial>),verify<loops>' -S < %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-- -passes='simple-loop-unswitch<nontrivial>' -verify-memoryssa -S < %s | FileCheck %s
+
+declare i32 @a()
+declare i32 @b()
+declare i32 @c()
+
+; Non-trivial loop unswitching where there are two distinct trivial
+; conditions to unswitch within the loop. The conditions are divergent
+; and should not unswitch.
+define void @test1(ptr %ptr, i1 %cond1, i1 %cond2) {
+; CHECK-LABEL: @test1(
+entry:
+ br label %loop_begin
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label %loop_begin
+
+loop_begin:
+ br i1 %cond1, label %loop_a, label %loop_b
+; CHECK: loop_begin:
+; CHECK-NEXT: br i1 %cond1, label %loop_a, label %loop_b
+
+loop_a:
+ %unused.a = call i32 @a()
+ br label %latch
+; CHECK: loop_a:
+; CHECK-NEXT: %unused.a = call i32 @a()
+; CHECK-NEXT: br label %latch
+
+loop_b:
+ br i1 %cond2, label %loop_b_a, label %loop_b_b
+; CHECK: loop_b:
+; CHECK-NEXT: br i1 %cond2, label %loop_b_a, label %loop_b_b
+
+loop_b_a:
+ %unused.b = call i32 @b()
+ br label %latch
+; CHECK: loop_b_a:
+; CHECK-NEXT: %unused.b = call i32 @b()
+; CHECK-NEXT: br label %latch
+
+loop_b_b:
+ %unused.c = call i32 @c()
+ br label %latch
+; CHECK: loop_b_b:
+; CHECK-NEXT: %unused.c = call i32 @c()
+; CHECK-NEXT: br label %latch
+
+latch:
+ %v = load i1, ptr %ptr
+ br i1 %v, label %loop_begin, label %loop_exit
+; CHECK: latch:
+; CHECK-NEXT: %v = load i1, ptr %ptr
+; CHECK-NEXT: br i1 %v, label %loop_begin, label %loop_exit
+
+loop_exit:
+ ret void
+; CHECK: loop_exit:
+; CHECK-NEXT: ret void
+}
+
+; Non-trivial loop unswitching where there are two distinct trivial
+; conditions to unswitch within the loop. The conditions are known to
+; be uniform, so it should be unswitchable. However, unswitch
+; currently does not make use of UniformityAnalysis.
+define amdgpu_kernel void @test1_uniform(ptr %ptr, i1 %cond1, i1 %cond2) {
+; CHECK-LABEL: @test1_uniform(
+entry:
+ br label %loop_begin
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label %loop_begin
+
+loop_begin:
+ br i1 %cond1, label %loop_a, label %loop_b
+; CHECK: loop_begin:
+; CHECK-NEXT: br i1 %cond1, label %loop_a, label %loop_b
+
+loop_a:
+ %unused.a = call i32 @a()
+ br label %latch
+; CHECK: loop_a:
+; CHECK-NEXT: %unused.a = call i32 @a()
+; CHECK-NEXT: br label %latch
+
+loop_b:
+ br i1 %cond2, label %loop_b_a, label %loop_b_b
+; CHECK: loop_b:
+; CHECK-NEXT: br i1 %cond2, label %loop_b_a, label %loop_b_b
+
+loop_b_a:
+ %unused.b = call i32 @b()
+ br label %latch
+; CHECK: loop_b_a:
+; CHECK-NEXT: %unused.b = call i32 @b()
+; CHECK-NEXT: br label %latch
+
+loop_b_b:
+ %unused.c = call i32 @c()
+ br label %latch
+; CHECK: loop_b_b:
+; CHECK-NEXT: %unused.c = call i32 @c()
+; CHECK-NEXT: br label %latch
+
+latch:
+ %v = load i1, ptr %ptr
+ br i1 %v, label %loop_begin, label %loop_exit
+; CHECK: latch:
+; CHECK-NEXT: %v = load i1, ptr %ptr
+; CHECK-NEXT: br i1 %v, label %loop_begin, label %loop_exit
+
+loop_exit:
+ ret void
+; CHECK: loop_exit:
+; CHECK-NEXT: ret void
+}
+
+; Non-trivial loop unswitching where there are two distinct trivial
+; conditions to unswitch within the loop. There is no divergence
+; because it's assumed it can only execute with a workgroup of size 1.
+define void @test1_single_lane_execution(ptr %ptr, i1 %cond1, i1 %cond2) #0 {
+; CHECK-LABEL: @test1_single_lane_execution(
+entry:
+ br label %loop_begin
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label %loop_begin
+
+loop_begin:
+ br i1 %cond1, label %loop_a, label %loop_b
+; CHECK: loop_begin:
+; CHECK-NEXT: br i1 %cond1, label %loop_a, label %loop_b
+
+loop_a:
+ %unused.a = call i32 @a()
+ br label %latch
+; CHECK: loop_a:
+; CHECK-NEXT: %unused.a = call i32 @a()
+; CHECK-NEXT: br label %latch
+
+loop_b:
+ br i1 %cond2, label %loop_b_a, label %loop_b_b
+; CHECK: loop_b:
+; CHECK-NEXT: br i1 %cond2, label %loop_b_a, label %loop_b_b
+
+loop_b_a:
+ %unused.b = call i32 @b()
+ br label %latch
+; CHECK: loop_b_a:
+; CHECK-NEXT: %unused.b = call i32 @b()
+; CHECK-NEXT: br label %latch
+
+loop_b_b:
+ %unused.c = call i32 @c()
+ br label %latch
+; CHECK: loop_b_b:
+; CHECK-NEXT: %unused.c = call i32 @c()
+; CHECK-NEXT: br label %latch
+
+latch:
+ %v = load i1, ptr %ptr
+ br i1 %v, label %loop_begin, label %loop_exit
+; CHECK: latch:
+; CHECK-NEXT: %v = load i1, ptr %ptr
+; CHECK-NEXT: br i1 %v, label %loop_begin, label %loop_exit
+
+loop_exit:
+ ret void
+; CHECK: loop_exit:
+; CHECK-NEXT: ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="1,1" }