From d5387ec2679b5449b81449c5ab3cb0e2fd754010 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Thu, 11 Feb 2021 16:42:35 +0000
Subject: [PATCH] [LV] Add tests showing suboptimal vectorization for narrow
 types.

This patch adds additional test cases showing missing or suboptimal
vectorization for loops that contain both narrow and wider memory
operations on AArch64.
---
 ...vectorization-factor-for-unprofitable-memops.ll | 124 +++++++++++++++++++++
 1 file changed, 124 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
new file mode 100644
index 0000000..3aef7a4
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
@@ -0,0 +1,124 @@
+; RUN: opt -loop-vectorize -mtriple=arm64-apple-darwin -S %s | FileCheck %s
+
+; Test cases for extending the vectorization factor, if small memory operations
+; are not profitable.
+
+; Test with a loop that contains memory accesses of i8 and i32 types. The
+; default maximum VF for NEON is 4, but vectorizing 4 x i8 is not
+; profitable. However, we can extend the VF to 8 or 16, at which point the
+; i8 memory accesses become profitable.
+define void @test_load_i8_store_i32(i8* noalias %src, i32* noalias %dst, i32 %off, i64 %N) {
+; CHECK-LABEL: @test_load_i8_store_i32(
+; CHECK-NOT: x i8>
+; Per iteration: dst[iv] = zext(src[iv]) + off. No i8 vector type may appear in this function's output.
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]  ; induction variable, starts at 0
+  %gep.src = getelementptr inbounds i8, i8* %src, i64 %iv  ; &src[iv]
+  %lv = load i8, i8* %gep.src, align 1  ; i8 load
+  %lv.ext = zext i8 %lv to i32  ; widen the loaded byte to the store type
+  %add = add i32 %lv.ext, %off
+  %gep.dst = getelementptr inbounds i32, i32* %dst, i64 %iv  ; &dst[iv]
+  store i32 %add, i32* %gep.dst  ; i32 store
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N  ; true once iv + 1 == N
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; Same as test_load_i8_store_i32, but with types flipped for load and store.
+define void @test_load_i32_store_i8(i32* noalias %src, i8* noalias %dst, i32 %off, i64 %N) {
+; CHECK-LABEL: @test_load_i32_store_i8(
+; CHECK:     <4 x i8>
+; Per iteration: dst[iv] = trunc(src[iv] + off) to i8. The output must contain a <4 x i8> type.
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]  ; induction variable, starts at 0
+  %gep.src = getelementptr inbounds i32, i32* %src, i64 %iv  ; &src[iv]
+  %lv = load i32, i32* %gep.src, align 1  ; i32 load that only claims 1-byte alignment
+  %add = add i32 %lv, %off  ; arithmetic is done in i32
+  %add.trunc = trunc i32 %add to i8  ; narrow the sum to the store type
+  %gep.dst = getelementptr inbounds i8, i8* %dst, i64 %iv  ; &dst[iv]
+  store i8 %add.trunc, i8* %gep.dst  ; i8 store
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N  ; true once iv + 1 == N
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; All memory operations use i32, so all of them are profitable with VF 4.
+define void @test_load_i32_store_i32(i32* noalias %src, i32* noalias %dst, i8 %off, i64 %N) {
+; CHECK-LABEL: @test_load_i32_store_i32(
+; CHECK: vector.body:
+; CHECK:   <4 x i32>
+; Per iteration: dst[iv] = zext(trunc(src[iv]) + off), with the add done in i8 (%off is i8).
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]  ; induction variable, starts at 0
+  %gep.src = getelementptr inbounds i32, i32* %src, i64 %iv  ; &src[iv]
+  %lv = load i32, i32* %gep.src, align 1  ; i32 load that only claims 1-byte alignment
+  %lv.trunc = trunc i32 %lv to i8  ; narrow so the add below operates on i8
+  %add = add i8 %lv.trunc, %off
+  %add.ext = zext i8 %add to i32  ; widen back to the store type
+  %gep.dst = getelementptr inbounds i32, i32* %dst, i64 %iv  ; &dst[iv]
+  store i32 %add.ext, i32* %gep.dst  ; i32 store
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N  ; true once iv + 1 == N
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; Test with loop body that requires a large number of vector registers if the
+; vectorization factor is large. Make sure the register estimates limit the
+; vectorization factor.
+define void @test_load_i8_store_i64_large(i8* noalias %src, i64* noalias %dst, i64* noalias %dst.2, i64* noalias %dst.3, i64* noalias %dst.4, i64* noalias %dst.5, i64 %off, i64 %off.2, i64 %N) {
+; CHECK-LABEL: @test_load_i8_store_i64_large(
+; CHECK: <2 x i64>
+; Per iteration: one i8 load and two i64 loads (dst.3, dst.5) feed five i64 stores (dst .. dst.5).
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]  ; induction variable, starts at 0
+  %gep.src = getelementptr inbounds i8, i8* %src, i64 %iv  ; &src[iv]
+  %gep.dst.3 = getelementptr inbounds i64, i64* %dst.3, i64 %iv  ; &dst.3[iv]
+  %lv.dst.3 = load i64, i64* %gep.dst.3, align 1
+  %gep.dst.5 = getelementptr inbounds i64, i64* %dst.5, i64 %iv  ; &dst.5[iv]
+  %lv.dst.5 = load i64, i64* %gep.dst.5, align 1  ; reads dst.5[iv], the same slot stored to below
+
+  %lv = load i8, i8* %gep.src, align 1  ; the only i8 memory access in the loop
+  %lv.ext = zext i8 %lv to i64  ; widen to the i64 type used by all stores
+  %add = add i64 %lv.ext, %off
+  %add.2 = add i64 %add, %off.2
+  %gep.dst = getelementptr inbounds i64, i64* %dst, i64 %iv  ; &dst[iv]
+  %gep.dst.2 = getelementptr inbounds i64, i64* %dst.2, i64 %iv  ; &dst.2[iv]
+
+  %add.3 = add i64 %add.2, %lv.dst.3
+  %add.4 = add i64 %add.3, %add
+  %gep.dst.4 = getelementptr inbounds i64, i64* %dst.4, i64 %iv  ; &dst.4[iv]
+  %add.5 = add i64 %add.2, %lv.dst.5
+  store i64 %add.2, i64* %gep.dst.2
+  store i64 %add, i64* %gep.dst
+  store i64 %add.3, i64* %gep.dst.3
+  store i64 %add.4, i64* %gep.dst.4
+  store i64 %add.5, i64* %gep.dst.5
+
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N  ; true once iv + 1 == N
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
-- 
2.7.4