From 6c348e4067b0826449caef2f77af2fb67c27040a Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs@arm.com>
Date: Thu, 16 Jul 2020 15:55:50 +0100
Subject: [PATCH] [HWLoops] Stop converting to a while loop when it would be
 unsafe to

There were cases where a do-while loop would be converted to a while
loop before finding out that it would be unsafe to expand the SCEV in
this situation and then bailing out of hardware loop conversion.

This patch checks if it would be unsafe to expand the SCEV and if so stops converting the do-while into a while, allowing conversion to a hardware loop.

Differential Revision: https://reviews.llvm.org/D83953
---
 llvm/lib/CodeGen/HardwareLoops.cpp                 | 11 ++-
 .../CodeGen/Thumb2/LowOverheadLoops/exitcount.ll   | 89 ++++++++++++++++++++++
 2 files changed, 98 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll
diff --git a/llvm/lib/CodeGen/HardwareLoops.cpp b/llvm/lib/CodeGen/HardwareLoops.cpp
index 0ba7e92..ffffc7c 100644
--- a/llvm/lib/CodeGen/HardwareLoops.cpp
+++ b/llvm/lib/CodeGen/HardwareLoops.cpp
@@ -402,8 +402,15 @@ Value *HardwareLoop::InitLoopCount() {
 
   BasicBlock *BB = L->getLoopPreheader();
   if (UseLoopGuard && BB->getSinglePredecessor() &&
-      cast<BranchInst>(BB->getTerminator())->isUnconditional())
-    BB = BB->getSinglePredecessor();
+      cast<BranchInst>(BB->getTerminator())->isUnconditional()) {
+    BasicBlock *Predecessor = BB->getSinglePredecessor();
+    // If it's not safe to create a while loop then don't force it and create a
+    // do-while loop instead
+    if (!isSafeToExpandAt(ExitCount, Predecessor->getTerminator(), SE))
+        UseLoopGuard = false;
+    else
+        BB = Predecessor;
+  }
 
   if (!isSafeToExpandAt(ExitCount, BB->getTerminator(), SE)) {
     LLVM_DEBUG(dbgs() << "- Bailing, unsafe to expand ExitCount "
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll
new file mode 100644
index 0000000..162ccf5
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled -o - %s | FileCheck %s
+%struct.SpeexPreprocessState_ = type { i32, i32, half*, half* }
+
+define void @foo(%struct.SpeexPreprocessState_* nocapture readonly %st, i16* %x) {
+; CHECK-LABEL: foo:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK:    dlstp.16 lr, r4
+; CHECK-NEXT:  .LBB0_1: @ %do.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrh.u16 q0, [r2], #16
+; CHECK-NEXT:    vstrh.16 q0, [r3], #16
+; CHECK-NEXT:    letp lr, .LBB0_1
+; CHECK:    dlstp.16 lr, r3
+; CHECK-NEXT:  .LBB0_3: @ %do.body6
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
+; CHECK-NEXT:    vcvt.f16.s16 q1, q1
+; CHECK-NEXT:    vmul.f16 q1, q1, q0
+; CHECK-NEXT:    vstrh.16 q1, [r0], #16
+; CHECK-NEXT:    letp lr, .LBB0_3
+; CHECK-NEXT:  @ %bb.4: @ %do.end13
+; CHECK-NEXT:    pop {r4, pc}
+entry:
+  %ps_size = getelementptr inbounds %struct.SpeexPreprocessState_, %struct.SpeexPreprocessState_* %st, i32 0, i32 1
+  %0 = load i32, i32* %ps_size, align 4
+  %mul = shl nsw i32 %0, 1
+  %frame_size = getelementptr inbounds %struct.SpeexPreprocessState_, %struct.SpeexPreprocessState_* %st, i32 0, i32 0
+  %1 = load i32, i32* %frame_size, align 4
+  %sub = sub nsw i32 %mul, %1
+  %inbuf = getelementptr inbounds %struct.SpeexPreprocessState_, %struct.SpeexPreprocessState_* %st, i32 0, i32 3
+  %2 = load half*, half** %inbuf, align 4
+  %frame = getelementptr inbounds %struct.SpeexPreprocessState_, %struct.SpeexPreprocessState_* %st, i32 0, i32 2
+  %3 = load half*, half** %frame, align 4
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+  %pinbuff16.0 = phi half* [ %2, %entry ], [ %add.ptr, %do.body ]
+  %blkCnt.0 = phi i32 [ %sub, %entry ], [ %sub2, %do.body ]
+  %pframef16.0 = phi half* [ %3, %entry ], [ %add.ptr1, %do.body ]
+  %4 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %blkCnt.0)
+  %5 = bitcast half* %pinbuff16.0 to <8 x half>*
+  %6 = tail call fast <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %5, i32 2, <8 x i1> %4, <8 x half> zeroinitializer)
+  %7 = bitcast half* %pframef16.0 to <8 x half>*
+  tail call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %6, <8 x half>* %7, i32 2, <8 x i1> %4)
+  %add.ptr = getelementptr inbounds half, half* %pinbuff16.0, i32 8
+  %add.ptr1 = getelementptr inbounds half, half* %pframef16.0, i32 8
+  %sub2 = add nsw i32 %blkCnt.0, -8
+  %cmp = icmp sgt i32 %blkCnt.0, 8
+  br i1 %cmp, label %do.body, label %do.end
+
+do.end:                                           ; preds = %do.body
+  %8 = load half*, half** %frame, align 4
+  %add.ptr4 = getelementptr inbounds half, half* %8, i32 %sub
+  %9 = load i32, i32* %frame_size, align 4
+  br label %do.body6
+
+do.body6:                                         ; preds = %do.body6, %do.end
+  %px.0 = phi i16* [ %x, %do.end ], [ %add.ptr8, %do.body6 ]
+  %blkCnt.1 = phi i32 [ %9, %do.end ], [ %sub10, %do.body6 ]
+  %pframef16.1 = phi half* [ %add.ptr4, %do.end ], [ %add.ptr9, %do.body6 ]
+  %10 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %blkCnt.1)
+  %11 = bitcast i16* %px.0 to <8 x i16>*
+  %12 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %11, i32 2, <8 x i1> %10, <8 x i16> zeroinitializer)
+  %13 = tail call fast <8 x half> @llvm.arm.mve.vcvt.fp.int.predicated.v8f16.v8i16.v8i1(<8 x i16> %12, i32 0, <8 x i1> %10, <8 x half> undef)
+  %14 = tail call fast <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> %13, <8 x half> <half 0xH1800, half 0xH1800, half 0xH1800, half 0xH1800, half 0xH1800, half 0xH1800, half 0xH1800, half 0xH1800>, <8 x i1> %10, <8 x half> undef)
+  %15 = bitcast half* %pframef16.1 to <8 x half>*
+  tail call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %14, <8 x half>* %15, i32 2, <8 x i1> %10)
+  %add.ptr8 = getelementptr inbounds i16, i16* %px.0, i32 8
+  %add.ptr9 = getelementptr inbounds half, half* %pframef16.1, i32 8
+  %sub10 = add nsw i32 %blkCnt.1, -8
+  %cmp12 = icmp sgt i32 %blkCnt.1, 8
+  br i1 %cmp12, label %do.body6, label %do.end13
+
+do.end13:                                         ; preds = %do.body6
+  ret void
+}
+
+declare <8 x i1> @llvm.arm.mve.vctp16(i32)
+
+declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32 immarg, <8 x i1>, <8 x half>)
+
+declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32 immarg, <8 x i1>)
+
+declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
+
+declare <8 x half> @llvm.arm.mve.vcvt.fp.int.predicated.v8f16.v8i16.v8i1(<8 x i16>, i32, <8 x i1>, <8 x half>)
+
+declare <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>)
-- 
2.7.4