From e3ea97b04962334e15047b26fbbbc04c90c78946 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Mon, 5 Aug 2019 11:12:23 +0000
Subject: [PATCH] [AArch64] Skip isZIPMask check for masks with an odd number
 of elements.

We process 2 elements at a time and expect the number of elements to be
even. Similar to D60690.

Reviewers: dmgreen, samparker, t.p.northover

Reviewed By: dmgreen

Differential Revision: https://reviews.llvm.org/D65400

llvm-svn: 367831
---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp    |  2 ++
 .../AArch64/arm64-neon-vector-shuffle-extract.ll   | 26 ++++++++++++++++++++++
 2 files changed, 28 insertions(+)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 6a7fdd4..d8c12eb 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6312,6 +6312,8 @@ static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
 
 static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
   unsigned NumElts = VT.getVectorNumElements();
+  if (NumElts % 2 != 0)
+    return false;
   WhichResult = (M[0] == 0 ? 0 : 1);
   unsigned Idx = WhichResult * NumElts / 2;
   for (unsigned i = 0; i != NumElts; i += 2) {
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-vector-shuffle-extract.ll b/llvm/test/CodeGen/AArch64/arm64-neon-vector-shuffle-extract.ll
index 7ed0e59..2be8b01 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-vector-shuffle-extract.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-vector-shuffle-extract.ll
@@ -31,3 +31,29 @@ define <4 x i32> @widen_shuffles_reduced(<3 x i32> %x, <3 x i32> %y) {
   %s3 = shufflevector <3 x i32> %y, <3 x i32> %x, <4 x i32> <i32 1, i32 4, i32 3, i32 0>
   ret <4 x i32> %s3
 }
+
+define void @zip_mask_check(<3 x float>* %p1, <3 x float>* %p2, i32* %p3) {
+; CHECK-LABEL: zip_mask_check:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    trn2 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    fmla v0.4s, v0.4s, v0.4s
+; CHECK-NEXT:    fmla v0.4s, v0.4s, v0.4s
+; CHECK-NEXT:    str s0, [x2]
+; CHECK-NEXT:    ret
+  %tmp3 = load <3 x float>, <3 x float>* %p1, align 16
+  %tmp4 = load <3 x float>, <3 x float>* %p2, align 4
+  %tmp5 = shufflevector <3 x float> %tmp3, <3 x float> %tmp4, <4 x i32> <i32 1, i32 4, i32 undef, i32 undef>
+  %tmp6 = shufflevector <4 x float> %tmp5, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
+  %tmp7 = shufflevector <4 x float> %tmp6, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
+  %tmp8 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp7, <4 x float> undef, <4 x float> undef)
+  %tmp9 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> %tmp8)
+  %tmp10 = shufflevector <4 x float> %tmp9, <4 x float> undef, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %tmp11 = bitcast <16 x float> %tmp10 to <16 x i32>
+  %tmp12 = extractelement <16 x i32> %tmp11, i32 0
+  store i32 %tmp12, i32* %p3, align 4
+  ret void
+}
+
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #1
-- 
2.7.4