From 1a8713046dfdbc997c30542154790feeaa6dfc14 Mon Sep 17 00:00:00 2001 From: Ayal Zaks Date: Tue, 16 Oct 2018 14:25:02 +0000 Subject: [PATCH] [LV] Add test checks when vectorizing loops under opt for size; NFC Landing this as a separate part of https://reviews.llvm.org/D50480, recording current behavior more accurately, to clarify subsequent diff ([LV] Vectorizing loops of arbitrary trip count without remainder under opt for size). llvm-svn: 344606 --- llvm/test/Transforms/LoopVectorize/X86/optsize.ll | 57 +++++++++++ .../Transforms/LoopVectorize/X86/small-size.ll | 107 ++++++++++++++++++-- .../LoopVectorize/X86/vect.omp.force.small-tc.ll | 108 ++++++++++++++++++--- 3 files changed, 253 insertions(+), 19 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/X86/optsize.ll diff --git a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll new file mode 100644 index 0000000..057c720 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll @@ -0,0 +1,57 @@ +; This test verifies that the loop vectorizer will NOT vectorize loops that +; will produce a tail loop with the optimize for size or the minimize size +; attributes. This is a target-dependent version of the test. +; RUN: opt < %s -loop-vectorize -force-vector-width=64 -S -mtriple=x86_64-unknown-linux -mcpu=skx | FileCheck %s + +target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128" + +@tab = common global [32 x i8] zeroinitializer, align 1 + +define i32 @foo_optsize() #0 { +; CHECK-LABEL: @foo_optsize( +; CHECK-NOT: x i8> + +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08 + %0 = load i8, i8* %arrayidx, align 1 + %cmp1 = icmp eq i8 %0, 0 + %. = select i1 %cmp1, i8 2, i8 1 + store i8 %., i8* %arrayidx, align 1 + %inc = add nsw i32 %i.08, 1 + %exitcond = icmp eq i32 %i.08, 202 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 0 +} + +attributes #0 = { optsize } + +define i32 @foo_minsize() #1 { +; CHECK-LABEL: @foo_minsize( +; CHECK-NOT: x i8> + +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08 + %0 = load i8, i8* %arrayidx, align 1 + %cmp1 = icmp eq i8 %0, 0 + %. = select i1 %cmp1, i8 2, i8 1 + store i8 %., i8* %arrayidx, align 1 + %inc = add nsw i32 %i.08, 1 + %exitcond = icmp eq i32 %i.08, 202 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 0 +} + +attributes #1 = { minsize } + diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll index 89d69e2..8af7b2e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -basicaa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -loop-vectorize-with-block-frequency -dce -instcombine -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" @@ -20,12 +21,33 @@ target triple = "x86_64-apple-macosx10.8.0" @dj = common global [1024 x i32] zeroinitializer, align 16 ; We can optimize this test without a tail. -;CHECK-LABEL: @example1( -;CHECK: load <4 x i32> -;CHECK: add nsw <4 x i32> -;CHECK: store <4 x i32> -;CHECK: ret void define void @example1() optsize { +; CHECK-LABEL: @example1( +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 16 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 16 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[INDEX]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 16 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[TMP10:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: br label [[TMP9:%.*]] +; CHECK: br i1 undef, label [[TMP10]], label [[TMP9]], !llvm.loop !2 +; CHECK: ret void +; br label %1 ;