From efa340fbd264c1a1fc3d0c41275536693369f897 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Mon, 30 Aug 2021 19:19:51 +0100
Subject: [PATCH] [ARM] Work around tail predication min/max cost model

The min/max intrinsics are not yet canonical, but when they are, the
tail predication analysis will change from treating them like an icmp
to treating them like intrinsics. Unfortunately, they can currently
produce better code by not being tail predicated, thanks to the
vectorizer picking higher VFs and the backend folding to better
instructions (especially for saturate patterns). In the long run we
will need to improve the vectorizer's cost modelling to recognize the
instructions directly, but in the meantime this treats min/max as
before, to prevent performance regressions.
---
 llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp   |  14 ++
 .../Transforms/PhaseOrdering/ARM/arm_mult_q15.ll | 227 +++++++++++++++++++++
 2 files changed, 241 insertions(+)
 create mode 100644 llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll

diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index d273482..17a46c1 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1960,6 +1960,20 @@ static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
   // we simply count the icmps, i.e. there should only be 1 for the backedge.
   if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
     return false;
+  // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics
+  // are not currently canonical, but soon will be. Code without them uses
+  // icmp, and so is not tail predicated as per the condition above. In order
+  // to get the same performance we treat min and max the same as an icmp for
+  // tailpred purposes for the moment (we often rely on non-tailpred and
+  // higher VFs to pick more optimal instructions like VQDMULH; they need to
+  // be recognized directly by the vectorizer).
+  if (auto *II = dyn_cast<IntrinsicInst>(&I))
+    if ((II->getIntrinsicID() == Intrinsic::smin ||
+         II->getIntrinsicID() == Intrinsic::smax ||
+         II->getIntrinsicID() == Intrinsic::umin ||
+         II->getIntrinsicID() == Intrinsic::umax) &&
+        ++ICmpCount > 1)
+      return false;
 
   if (isa<FCmpInst>(&I))
     return false;
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
new file mode 100644
index 0000000..c1a57bf
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
@@ -0,0 +1,227 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s
+
+; This test, after a lot of cleanup, should pick a tail-predicated 8x vector
+; loop. The 8x is more profitable, allowing a VQDMULH.s16 instruction to be picked.
+; FIXME: Tail predicate too, but not at the expense of the 8x vectorization.
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-arm-none-eabi"
+
+define void @arm_mult_q15(i16* %pSrcA, i16* %pSrcB, i16 * noalias %pDst, i32 %blockSize) #0 {
+; CHECK-LABEL: @arm_mult_q15(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP_NOT3:%.*]] = icmp eq i32 [[BLOCKSIZE:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP_NOT3]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK: while.body.preheader:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[WHILE_BODY_PREHEADER17:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[BLOCKSIZE]], -8
+; CHECK-NEXT: [[IND_END:%.*]] = and i32 [[BLOCKSIZE]], 7
+; CHECK-NEXT: [[IND_END9:%.*]] = getelementptr i16, i16* [[PSRCA:%.*]], i32 [[N_VEC]]
+; CHECK-NEXT: [[IND_END11:%.*]] = getelementptr i16, i16* [[PDST:%.*]], i32 [[N_VEC]]
+; CHECK-NEXT: [[IND_END13:%.*]] = getelementptr i16, i16* [[PSRCB:%.*]], i32 [[N_VEC]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[PSRCA]], i32 [[INDEX]]
+; CHECK-NEXT: [[NEXT_GEP14:%.*]] = getelementptr i16, i16* [[PDST]], i32 [[INDEX]]
+; CHECK-NEXT: [[NEXT_GEP15:%.*]] = getelementptr i16, i16* [[PSRCB]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[NEXT_GEP]] to <8 x i16>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
+; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[NEXT_GEP15]] to <8 x i16>*
+; CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <8 x i16>, <8 x i16>* [[TMP2]], align 2
+; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_LOAD16]] to <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <8 x i32> [[TMP3]], [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = ashr <8 x i32> [[TMP4]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <8 x i32> [[TMP5]], <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i32> [[TMP5]], <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+; CHECK-NEXT: [[TMP8:%.*]] = trunc <8 x i32> [[TMP7]] to <8 x i16>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16* [[NEXT_GEP14]] to <8 x i16>*
+; CHECK-NEXT: store <8 x i16> [[TMP8]], <8 x i16>* [[TMP9]], align 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[BLOCKSIZE]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[WHILE_BODY_PREHEADER17]]
+; CHECK: while.body.preheader17:
+; CHECK-NEXT: [[BLKCNT_07_PH:%.*]] = phi i32 [ [[BLOCKSIZE]], [[WHILE_BODY_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[PSRCA_ADDR_06_PH:%.*]] = phi i16* [ [[PSRCA]], [[WHILE_BODY_PREHEADER]] ], [ [[IND_END9]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[PDST_ADDR_05_PH:%.*]] = phi i16* [ [[PDST]], [[WHILE_BODY_PREHEADER]] ], [ [[IND_END11]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[PSRCB_ADDR_04_PH:%.*]] = phi i16* [ [[PSRCB]], [[WHILE_BODY_PREHEADER]] ], [ [[IND_END13]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK: while.body:
+; CHECK-NEXT: [[BLKCNT_07:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BLKCNT_07_PH]], [[WHILE_BODY_PREHEADER17]] ]
+; CHECK-NEXT: 
[[PSRCA_ADDR_06:%.*]] = phi i16* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[PSRCA_ADDR_06_PH]], [[WHILE_BODY_PREHEADER17]] ] +; CHECK-NEXT: [[PDST_ADDR_05:%.*]] = phi i16* [ [[INCDEC_PTR4:%.*]], [[WHILE_BODY]] ], [ [[PDST_ADDR_05_PH]], [[WHILE_BODY_PREHEADER17]] ] +; CHECK-NEXT: [[PSRCB_ADDR_04:%.*]] = phi i16* [ [[INCDEC_PTR1:%.*]], [[WHILE_BODY]] ], [ [[PSRCB_ADDR_04_PH]], [[WHILE_BODY_PREHEADER17]] ] +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_06]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = load i16, i16* [[PSRCA_ADDR_06]], align 2 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP11]] to i32 +; CHECK-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_04]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = load i16, i16* [[PSRCB_ADDR_04]], align 2 +; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[TMP12]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]] +; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[MUL]], 15 +; CHECK-NEXT: [[TMP13:%.*]] = icmp slt i32 [[SHR]], 32767 +; CHECK-NEXT: [[RETVAL_1_I:%.*]] = select i1 [[TMP13]], i32 [[SHR]], i32 32767 +; CHECK-NEXT: [[CONV3:%.*]] = trunc i32 [[RETVAL_1_I]] to i16 +; CHECK-NEXT: [[INCDEC_PTR4]] = getelementptr inbounds i16, i16* [[PDST_ADDR_05]], i32 1 +; CHECK-NEXT: store i16 [[CONV3]], i16* [[PDST_ADDR_05]], align 2 +; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_07]], -1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0 +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: while.end: +; CHECK-NEXT: ret void +; +entry: + %pSrcA.addr = alloca i16*, align 4 + %pSrcB.addr = alloca i16*, align 4 + %pDst.addr = alloca i16*, align 4 + %blockSize.addr = alloca i32, align 4 + %blkCnt = alloca i32, align 4 + store i16* %pSrcA, i16** %pSrcA.addr, align 4 + store i16* %pSrcB, i16** %pSrcB.addr, align 4 + store i16* %pDst, i16** %pDst.addr, align 4 + store i32 %blockSize, i32* %blockSize.addr, align 4 + %0 = bitcast i32* %blkCnt to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #3 + %1 = load i32, i32* %blockSize.addr, align 4 + store i32 %1, i32* %blkCnt, align 4 + br label %while.cond + +while.cond: ; preds = %while.body, %entry + %2 = load i32, i32* %blkCnt, align 4 + %cmp = icmp ugt i32 %2, 0 + br i1 %cmp, label %while.body, label %while.end + +while.body: ; preds = %while.cond + %3 = load i16*, i16** %pSrcA.addr, align 4 + %incdec.ptr = getelementptr inbounds i16, i16* %3, i32 1 + store i16* %incdec.ptr, i16** %pSrcA.addr, align 4 + %4 = load i16, i16* %3, align 2 + %conv = sext i16 %4 to i32 + %5 = load i16*, i16** %pSrcB.addr, align 4 + %incdec.ptr1 = getelementptr inbounds i16, i16* %5, i32 1 + store i16* %incdec.ptr1, i16** %pSrcB.addr, align 4 + %6 = load i16, i16* %5, align 2 + %conv2 = sext i16 %6 to i32 + %mul = mul nsw i32 %conv, %conv2 + %shr = ashr i32 %mul, 15 + %call = call i32 @__SSAT(i32 %shr, i32 16) + %conv3 = trunc i32 %call to i16 + %7 = load i16*, i16** %pDst.addr, align 4 + %incdec.ptr4 = getelementptr inbounds i16, i16* %7, i32 1 + store i16* %incdec.ptr4, i16** %pDst.addr, align 4 + store i16 %conv3, i16* %7, align 2 + %8 = load i32, i32* %blkCnt, align 4 + %dec = add i32 %8, -1 + store i32 %dec, i32* %blkCnt, align 4 + br label %while.cond + +while.end: ; preds = %while.cond + %9 = bitcast i32* %blkCnt to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %9) #3 + ret void +} + +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1 + +define internal i32 @__SSAT(i32 %val, i32 %sat) #2 { +entry: + %retval = 
alloca i32, align 4 + %val.addr = alloca i32, align 4 + %sat.addr = alloca i32, align 4 + %max = alloca i32, align 4 + %min = alloca i32, align 4 + %cleanup.dest.slot = alloca i32, align 4 + store i32 %val, i32* %val.addr, align 4 + store i32 %sat, i32* %sat.addr, align 4 + %0 = load i32, i32* %sat.addr, align 4 + %cmp = icmp uge i32 %0, 1 + br i1 %cmp, label %land.lhs.true, label %if.end10 + +land.lhs.true: ; preds = %entry + %1 = load i32, i32* %sat.addr, align 4 + %cmp1 = icmp ule i32 %1, 32 + br i1 %cmp1, label %if.then, label %if.end10 + +if.then: ; preds = %land.lhs.true + %2 = bitcast i32* %max to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #3 + %3 = load i32, i32* %sat.addr, align 4 + %sub = sub i32 %3, 1 + %shl = shl i32 1, %sub + %sub2 = sub i32 %shl, 1 + store i32 %sub2, i32* %max, align 4 + %4 = bitcast i32* %min to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #3 + %5 = load i32, i32* %max, align 4 + %sub3 = sub nsw i32 -1, %5 + store i32 %sub3, i32* %min, align 4 + %6 = load i32, i32* %val.addr, align 4 + %7 = load i32, i32* %max, align 4 + %cmp4 = icmp sgt i32 %6, %7 + br i1 %cmp4, label %if.then5, label %if.else + +if.then5: ; preds = %if.then + %8 = load i32, i32* %max, align 4 + store i32 %8, i32* %retval, align 4 + store i32 1, i32* %cleanup.dest.slot, align 4 + br label %cleanup + +if.else: ; preds = %if.then + %9 = load i32, i32* %val.addr, align 4 + %10 = load i32, i32* %min, align 4 + %cmp6 = icmp slt i32 %9, %10 + br i1 %cmp6, label %if.then7, label %if.end + +if.then7: ; preds = %if.else + %11 = load i32, i32* %min, align 4 + store i32 %11, i32* %retval, align 4 + store i32 1, i32* %cleanup.dest.slot, align 4 + br label %cleanup + +if.end: ; preds = %if.else + br label %if.end8 + +if.end8: ; preds = %if.end + store i32 0, i32* %cleanup.dest.slot, align 4 + br label %cleanup + +cleanup: ; preds = %if.end8, %if.then7, %if.then5 + %12 = bitcast i32* %min to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %12) #3 + %13 = bitcast i32* %max to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %13) #3 + %cleanup.dest = load i32, i32* %cleanup.dest.slot, align 4 + switch i32 %cleanup.dest, label %unreachable [ + i32 0, label %cleanup.cont + i32 1, label %return + ] + +cleanup.cont: ; preds = %cleanup + br label %if.end10 + +if.end10: ; preds = %cleanup.cont, %land.lhs.true, %entry + %14 = load i32, i32* %val.addr, align 4 + store i32 %14, i32* %retval, align 4 + br label %return + +return: ; preds = %if.end10, %cleanup + %15 = load i32, i32* %retval, align 4 + ret i32 %15 + +unreachable: ; preds = %cleanup + unreachable +} + +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1 + +attributes #0 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" "unsafe-fp-math"="true" } +attributes #1 = { argmemonly nofree nosync nounwind willreturn } +attributes #2 = { alwaysinline nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-infs-fp-math"="true" 
"no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" "unsafe-fp-math"="true" } +attributes #3 = { nounwind } -- 2.7.4