From f0ec9aff2a811dfaf5158bbcd3c2b7a300c56519 Mon Sep 17 00:00:00 2001
From: Kevin Qin <Kevin.Qin@arm.com>
Date: Wed, 18 Jun 2014 05:54:42 +0000
Subject: [PATCH] [AArch64] Fix a pattern match failure caused by creating
 improper CONCAT_VECTOR.

ReconstructShuffle() may wrongly creat a CONCAT_VECTOR trying to
concat 2 of v2i32 into v4i16. This commit is to fix this issue and
try to generate UZP1 instead of lots of MOV and INS.
Patch is initalized by Kevin Qin, and refactored by Tim Northover.

llvm-svn: 211144
---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp  | 66 ++++++++++++++----------
 llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll | 33 ++++++++++++
 2 files changed, 72 insertions(+), 27 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ec10968..e45ca4d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4110,6 +4110,7 @@ static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
 // shuffle in combination with VEXTs.
 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
                                                   SelectionDAG &DAG) const {
+  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
   SDLoc dl(Op);
   EVT VT = Op.getValueType();
   unsigned NumElts = VT.getVectorNumElements();
@@ -4158,35 +4159,47 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
 
   SDValue ShuffleSrcs[2] = { DAG.getUNDEF(VT), DAG.getUNDEF(VT) };
   int VEXTOffsets[2] = { 0, 0 };
+  int OffsetMultipliers[2] = { 1, 1 };
 
   // This loop extracts the usage patterns of the source vectors
   // and prepares appropriate SDValues for a shuffle if possible.
   for (unsigned i = 0; i < SourceVecs.size(); ++i) {
-    if (SourceVecs[i].getValueType() == VT) {
+    unsigned NumSrcElts = SourceVecs[i].getValueType().getVectorNumElements();
+    SDValue CurSource = SourceVecs[i];
+    if (SourceVecs[i].getValueType().getVectorElementType() !=
+        VT.getVectorElementType()) {
+      // It may hit this case if SourceVecs[i] is AssertSext/AssertZext.
+      // Then bitcast it to the vector which holds asserted element type,
+      // and record the multiplier of element width between SourceVecs and
+      // Build_vector which is needed to extract the correct lanes later.
+      EVT CastVT =
+          EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+                           SourceVecs[i].getValueSizeInBits() /
+                               VT.getVectorElementType().getSizeInBits());
+
+      CurSource = DAG.getNode(ISD::BITCAST, dl, CastVT, SourceVecs[i]);
+      OffsetMultipliers[i] = CastVT.getVectorNumElements() / NumSrcElts;
+      NumSrcElts *= OffsetMultipliers[i];
+      MaxElts[i] *= OffsetMultipliers[i];
+      MinElts[i] *= OffsetMultipliers[i];
+    }
+
+    if (CurSource.getValueType() == VT) {
       // No VEXT necessary
-      ShuffleSrcs[i] = SourceVecs[i];
+      ShuffleSrcs[i] = CurSource;
       VEXTOffsets[i] = 0;
       continue;
-    } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) {
+    } else if (NumSrcElts < NumElts) {
       // We can pad out the smaller vector for free, so if it's part of a
       // shuffle...
-      ShuffleSrcs[i] = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, SourceVecs[i],
-                                   DAG.getUNDEF(SourceVecs[i].getValueType()));
+      ShuffleSrcs[i] = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, CurSource,
+                                   DAG.getUNDEF(CurSource.getValueType()));
       continue;
     }
 
-    // Don't attempt to extract subvectors from BUILD_VECTOR sources
-    // that expand or trunc the original value.
-    // TODO: We can try to bitcast and ANY_EXTEND the result but
-    // we need to consider the cost of vector ANY_EXTEND, and the
-    // legality of all the types.
-    if (SourceVecs[i].getValueType().getVectorElementType() !=
-        VT.getVectorElementType())
-      return SDValue();
-
     // Since only 64-bit and 128-bit vectors are legal on ARM and
     // we've eliminated the other cases...
-    assert(SourceVecs[i].getValueType().getVectorNumElements() == 2 * NumElts &&
+    assert(NumSrcElts == 2 * NumElts &&
            "unexpected vector sizes in ReconstructShuffle");
 
     if (MaxElts[i] - MinElts[i] >= NumElts) {
@@ -4197,22 +4210,20 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
     if (MinElts[i] >= NumElts) {
       // The extraction can just take the second half
       VEXTOffsets[i] = NumElts;
-      ShuffleSrcs[i] =
-          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i],
-                      DAG.getIntPtrConstant(NumElts));
+      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
+                                   DAG.getIntPtrConstant(NumElts));
     } else if (MaxElts[i] < NumElts) {
       // The extraction can just take the first half
       VEXTOffsets[i] = 0;
-      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
-                                   SourceVecs[i], DAG.getIntPtrConstant(0));
+      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
+                                   DAG.getIntPtrConstant(0));
     } else {
       // An actual VEXT is needed
       VEXTOffsets[i] = MinElts[i];
-      SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
-                                     SourceVecs[i], DAG.getIntPtrConstant(0));
-      SDValue VEXTSrc2 =
-          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i],
-                      DAG.getIntPtrConstant(NumElts));
+      SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
+                                     DAG.getIntPtrConstant(0));
+      SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
+                                     DAG.getIntPtrConstant(NumElts));
       unsigned Imm = VEXTOffsets[i] * getExtFactor(VEXTSrc1);
       ShuffleSrcs[i] = DAG.getNode(AArch64ISD::EXT, dl, VT, VEXTSrc1, VEXTSrc2,
                                    DAG.getConstant(Imm, MVT::i32));
@@ -4232,9 +4243,10 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
     int ExtractElt =
         cast<ConstantSDNode>(Op.getOperand(i).getOperand(1))->getSExtValue();
     if (ExtractVec == SourceVecs[0]) {
-      Mask.push_back(ExtractElt - VEXTOffsets[0]);
+      Mask.push_back(ExtractElt * OffsetMultipliers[0] - VEXTOffsets[0]);
     } else {
-      Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]);
+      Mask.push_back(ExtractElt * OffsetMultipliers[1] + NumElts -
+                     VEXTOffsets[1]);
     }
   }
 
diff --git a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
new file mode 100644
index 0000000..6bfb0a5
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+
+define <4 x i16> @fptosi_v4f64_to_v4i16(<4 x double>* %ptr) {
+; CHECK: fptosi_v4f64_to_v4i16
+; CHECK-DAG: fcvtzs  v[[LHS:[0-9]+]].2d, v1.2d
+; CHECK-DAG: fcvtzs  v[[RHS:[0-9]+]].2d, v0.2d
+; CHECK-DAG: xtn  v[[LHS_NA:[0-9]+]].2s, v[[LHS]].2d
+; CHECK-DAG: xtn  v[[RHS_NA:[0-9]+]].2s, v[[RHS]].2d
+; CHECK:     uzp1  v0.4h, v[[RHS_NA]].4h, v[[LHS_NA]].4h
+  %tmp1 = load <4 x double>* %ptr
+  %tmp2 = fptosi <4 x double> %tmp1 to <4 x i16>
+  ret <4 x i16> %tmp2
+}
+
+define <8 x i8> @fptosi_v4f64_to_v4i8(<8 x double>* %ptr) {
+; CHECK: fptosi_v4f64_to_v4i8
+; CHECK-DAG:  fcvtzs  v[[CONV3:[0-9]+]].2d, v3.2d
+; CHECK-DAG:  fcvtzs  v[[CONV2:[0-9]+]].2d, v2.2d
+; CHECK-DAG:  fcvtzs  v[[CONV1:[0-9]+]].2d, v1.2d
+; CHECK-DAG:  fcvtzs  v[[CONV0:[0-9]+]].2d, v0.2d
+; CHECK-DAG:  xtn  v[[NA3:[0-9]+]].2s, v[[CONV3]].2d
+; CHECK-DAG:  xtn  v[[NA2:[0-9]+]].2s, v[[CONV2]].2d
+; CHECK-DAG:  xtn  v[[NA1:[0-9]+]].2s, v[[CONV1]].2d
+; CHECK-DAG:  xtn  v[[NA0:[0-9]+]].2s, v[[CONV0]].2d
+; CHECK-DAG:  uzp1  v[[TMP1:[0-9]+]].4h, v[[CONV2]].4h, v[[CONV3]].4h
+; CHECK-DAG:  uzp1  v[[TMP2:[0-9]+]].4h, v[[CONV0]].4h, v[[CONV1]].4h
+; CHECK:      uzp1  v0.8b, v[[TMP2]].8b, v[[TMP1]].8b
+  %tmp1 = load <8 x double>* %ptr
+  %tmp2 = fptosi <8 x double> %tmp1 to <8 x i8>
+  ret <8 x i8> %tmp2
+}
+
-- 
2.7.4