From 83288f8063377f03cbcf3e89c940d2a62c855a96 Mon Sep 17 00:00:00 2001
From: Roman Lebedev <lebedev.ri@gmail.com>
Date: Mon, 26 Dec 2022 21:46:20 +0300
Subject: [PATCH] [AArch64] Custom lower `ISD::ZERO_EXTEND_VECTOR_INREG`

The baseline legalization for `ISD::ZERO_EXTEND_VECTOR_INREG`
(`VectorLegalizer::ExpandZERO_EXTEND_VECTOR_INREG`) blends in the zeros,
but as mentioned e.g. in b4bd0a404fe26071dab0854dfd9767974909c7c4,
AArch64 has no such blend instruction. So some shuffles that would have
been nicely lowered by `LowerVECTOR_SHUFFLE()`, e.g. into `ZIP1`, become
unrecognizable after round-tripping through
`ISD::ZERO_EXTEND_VECTOR_INREG` recognition & legalization.

The most obvious fix is to custom-lower `ISD::ZERO_EXTEND_VECTOR_INREG`
as the `ZIP1`-with-zeros it would originally have been lowered to in
that test case.
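For illustration, a hand-written sketch (not copied from the test file
or its CHECK lines; `%t`/`%r` are placeholder names): a shuffle that
zero-extends the low lanes, such as

  %r = shufflevector <8 x i8> %t, <8 x i8> zeroinitializer,
       <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>

is recognized by DAGCombine as `ISD::ZERO_EXTEND_VECTOR_INREG`, and with
this patch it should again lower to an interleave with a zeroed
register, roughly:

  movi v1.2d, #0000000000000000 // materialize the zero vector
  zip1 v0.8b, v0.8b, v1.8b      // interleave source lanes with zeros

instead of going through the generic blend expansion. Extension factors
other than 2 still fall back to that expansion, see the FIXME below.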
---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 24 ++++++++++++++++++++++++
 llvm/lib/Target/AArch64/AArch64ISelLowering.h   |  1 +
 llvm/test/CodeGen/AArch64/aarch64-vuzp.ll       |  7 ++-----
 3 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2627e44..d65693f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1554,6 +1554,7 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT) {
   setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
   setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
   setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+  setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
   setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
   setOperationAction(ISD::SRA, VT, Custom);
@@ -5919,6 +5920,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerEXTRACT_VECTOR_ELT(Op, DAG);
   case ISD::BUILD_VECTOR:
     return LowerBUILD_VECTOR(Op, DAG);
+  case ISD::ZERO_EXTEND_VECTOR_INREG:
+    return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
   case ISD::VECTOR_SHUFFLE:
     return LowerVECTOR_SHUFFLE(Op, DAG);
   case ISD::SPLAT_VECTOR:
@@ -11443,6 +11446,27 @@ static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
                      Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
 }
 
+// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend in zeros,
+// but we don't have an appropriate instruction,
+// so custom-lower it as ZIP1-with-zeros.
+SDValue
+AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  SDValue SrcOp = Op.getOperand(0);
+  EVT SrcVT = SrcOp.getValueType();
+  assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
+         "Unexpected extension factor.");
+  unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
+  // FIXME: support multi-step zipping?
+  if (Scale != 2)
+    return SDValue();
+  SDValue Zeros = DAG.getConstant(0, dl, SrcVT);
+  return DAG.getBitcast(VT,
+                        DAG.getNode(AArch64ISD::ZIP1, dl, SrcVT, SrcOp, Zeros));
+}
+
 SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
                                                    SelectionDAG &DAG) const {
   SDLoc dl(Op);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 1eac0af..e62a1bf 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1039,6 +1039,7 @@ private:
   SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerZERO_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerDUPQLane(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-vuzp.ll b/llvm/test/CodeGen/AArch64/aarch64-vuzp.ll
index 1032699..c1d9cae 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-vuzp.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-vuzp.ll
@@ -59,10 +59,8 @@ entry:
 
 ; Check that this pattern is recognized as a VZIP and
 ; that the vector blend transform does not scramble the pattern.
-; FIXME: we can not recognize generic ZERO_EXTEND_VECTOR_INREG legalization
-; as a zip1.
 ; CHECK-LABEL: vzipNoBlend:
-; CHECK-NOT: zip1
+; CHECK: zip1
 define <8 x i8> @vzipNoBlend(ptr %A, ptr %B) nounwind {
   %t = load <8 x i8>, ptr %A
   %vzip = shufflevector <8 x i8> %t, <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -78,9 +76,8 @@ define <8 x i8> @vzipNoBlendCommutted(ptr %A, ptr %B) nounwind {
   ret <8 x i8> %vzip
 }
 
-; FIXME: this is identical to @vzipNoBlend
 ; CHECK-LABEL: vzipStillZExt:
-; CHECK-NOT: zip1
+; CHECK: zip1
 define <8 x i8> @vzipStillZExt(ptr %A, ptr %B) nounwind {
   %t = load <8 x i8>, ptr %A
   %vzip = shufflevector <8 x i8> %t, <8 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <8 x i32> <i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8>
-- 
2.7.4