From ef160de3e5af3c8e51928fbe7b096af3d9471880 Mon Sep 17 00:00:00 2001
From: Nicolai Haehnle <nhaehnle@gmail.com>
Date: Wed, 16 Mar 2016 20:14:33 +0000
Subject: [PATCH] AMDGPU: Prevent uniform loops from becoming infinite

Summary:
Uniform loops where the branch leaving the loop is predicated on VCCNZ
must be skipped if EXEC = 0, otherwise they will be infinite.

Reviewers: tstellarAMD, arsenm

Subscribers: arsenm, llvm-commits

Differential Revision: http://reviews.llvm.org/D18137

llvm-svn: 263658
---
 llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp      |  9 ++++++-
 .../AMDGPU/uniform-loop-inside-nonuniform.ll       | 28 ++++++++++++++++++++++
 2 files changed, 36 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll

diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 7dd0d7b..2a645d1 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -137,6 +137,13 @@ bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
          NumInstr < SkipThreshold && I != E; ++I) {
 
-      if (I->isBundle() || !I->isBundled())
+      if (I->isBundle() || !I->isBundled()) {
+        // When a uniform loop is inside non-uniform control flow, the branch
+        // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
+        // when EXEC = 0. We should skip the loop lest it becomes infinite.
+        if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ)
+          return true;
+
         if (++NumInstr >= SkipThreshold)
           return true;
+      }
     }
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll b/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
new file mode 100644
index 0000000..26927e4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
@@ -0,0 +1,28 @@
+;RUN: llc -march=amdgcn -mcpu=verde < %s | FileCheck %s --check-prefix=CHECK
+
+; Test a simple uniform loop that lives inside non-uniform control flow.
+; Expect an s_cbranch_execz ahead of the loop body so it is skipped if EXEC = 0.
+;CHECK-LABEL: {{^}}test1:
+;CHECK: s_cbranch_execz
+;CHECK: %loop_body
+define void @test1(<8 x i32> inreg %rsrc, <2 x i32> %addr.base, i32 %y, i32 %p) #0 {
+main_body:
+  %cc = icmp eq i32 %p, 0 ; %p is not inreg - presumably per-lane, so this branch is non-uniform (confirm)
+  br i1 %cc, label %out, label %loop_body
+
+loop_body:
+  %counter = phi i32 [ 0, %main_body ], [ %incr, %loop_body ]
+
+  ; Empty side-effecting inline asm keeps the loop from being optimized out
+  call void asm sideeffect "", "" ()
+
+  %incr = add i32 %counter, 1
+  %lc = icmp sge i32 %incr, 1000 ; leave the loop once the counter reaches 1000
+  br i1 %lc, label %out, label %loop_body
+
+out:
+  ret void
+}
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readonly } ; NOTE(review): #1 is not referenced anywhere in this test
-- 
2.7.4