From 0b38af89e2c0adc13a6efb1dd04485229ef0d1c6 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Sat, 21 Dec 2019 10:33:58 -0500
Subject: [PATCH] [AArch64] match splat of bitcasted extract subvector to
 DUPLANE

This is another potential regression exposed by D63815.

Here we peek through a bitcast to find an extract subvector and scale
the splat offset based on that:
splat (bitcast (extract X, C)), LaneC --> duplane (bitcast X), LaneC'

Differential Revision: https://reviews.llvm.org/D71672
---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 50 +++++++++++++++++++++----
 llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll  | 16 ++++----
 2 files changed, 51 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a3dd2e6..d01979d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7086,19 +7086,55 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
 
   // Otherwise, duplicate from the lane of the input vector.
   unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
 
-  // SelectionDAGBuilder may have "helpfully" already extracted or conatenated
-  // to make a vector of the same size as this SHUFFLE. We can ignore the
-  // extract entirely, and canonicalise the concat using WidenVector.
-  if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
-    Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
+  // Try to eliminate a bitcasted extract subvector before a DUPLANE.
+  auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
+    // Match: dup (bitcast (extract_subv X, C)), LaneC
+    if (BitCast.getOpcode() != ISD::BITCAST ||
+        BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
+      return false;
+
+    // The extract index must align in the destination type. That may not
+    // happen if the bitcast is from narrow to wide type.
+    SDValue Extract = BitCast.getOperand(0);
+    unsigned ExtIdx = Extract.getConstantOperandVal(1);
+    unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
+    unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
+    unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
+    if (ExtIdxInBits % CastedEltBitWidth != 0)
+      return false;
+
+    // Update the lane value by offsetting with the scaled extract index.
+    LaneC += ExtIdxInBits / CastedEltBitWidth;
+
+    // Determine the casted vector type of the wide vector input.
+    // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
+    // Examples:
+    //  dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
+    //  dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
+    unsigned SrcVecNumElts =
+        Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
+    CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
+                              SrcVecNumElts);
+    return true;
+  };
+  MVT CastVT;
+  if (getScaledOffsetDup(V1, Lane, CastVT)) {
+    V1 = DAG.getBitcast(CastVT, V1.getOperand(0).getOperand(0));
+  } else if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+    // The lane is incremented by the index of the extract.
+    // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
+    Lane += V1.getConstantOperandVal(1);
     V1 = V1.getOperand(0);
   } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
+    // The lane is decremented if we are splatting from the 2nd operand.
+    // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
     unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
     Lane -= Idx * VT.getVectorNumElements() / 2;
     V1 = WidenVector(V1.getOperand(Idx), DAG);
-  } else if (VT.getSizeInBits() == 64)
+  } else if (VT.getSizeInBits() == 64) {
+    // Widen the operand to 128-bit register with undef.
     V1 = WidenVector(V1, DAG);
-
+  }
   return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64));
 }

diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
index 47d82a1..2682678 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -1663,8 +1663,7 @@ entry:
 define <2 x float> @test_vmul_laneq3_f32_bitcast(<2 x float> %a, <2 x double> %v) {
 ; CHECK-LABEL: test_vmul_laneq3_f32_bitcast:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    fmul v0.2s, v0.2s, v1.s[1]
+; CHECK-NEXT:    fmul v0.2s, v0.2s, v1.s[3]
 ; CHECK-NEXT:    ret
   %extract = shufflevector <2 x double> %v, <2 x double> undef, <1 x i32> <i32 1>
   %bc = bitcast <1 x double> %extract to <2 x float>
@@ -1676,8 +1675,7 @@ define <2 x float> @test_vmul_laneq3_f32_bitcast(<2 x float> %a, <2 x double> %v
 define <2 x float> @test_vmul_laneq2_f32_bitcast(<2 x float> %a, <2 x double> %v) {
 ; CHECK-LABEL: test_vmul_laneq2_f32_bitcast:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    fmul v0.2s, v0.2s, v1.s[0]
+; CHECK-NEXT:    fmul v0.2s, v0.2s, v1.s[2]
 ; CHECK-NEXT:    ret
   %extract = shufflevector <2 x double> %v, <2 x double> undef, <1 x i32> <i32 1>
   %bc = bitcast <1 x double> %extract to <2 x float>
@@ -1689,8 +1687,7 @@ define <2 x float> @test_vmul_laneq2_f32_bitcast(<2 x float> %a, <2 x double> %v
 define <4 x i16> @test_vadd_laneq5_i16_bitcast(<4 x i16> %a, <2 x double> %v) {
 ; CHECK-LABEL: test_vadd_laneq5_i16_bitcast:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    dup v1.4h, v1.h[1]
+; CHECK-NEXT:    dup v1.4h, v1.h[5]
 ; CHECK-NEXT:    add v0.4h, v1.4h, v0.4h
 ; CHECK-NEXT:    ret
   %extract = shufflevector <2 x double> %v, <2 x double> undef, <1 x i32> <i32 1>
@@ -1700,6 +1697,8 @@ define <4 x i16> @test_vadd_laneq5_i16_bitcast(<4 x i16> %a, <2 x double> %v) {
   ret <4 x i16> %r
 }
 
+; TODO: The pattern in LowerVECTOR_SHUFFLE does not match what we are looking for.
+
 define <4 x i16> @test_vadd_lane2_i16_bitcast_bigger_aligned(<4 x i16> %a, <16 x i8> %v) {
 ; CHECK-LABEL: test_vadd_lane2_i16_bitcast_bigger_aligned:
 ; CHECK:       // %bb.0:
@@ -1717,8 +1716,7 @@ define <4 x i16> @test_vadd_lane2_i16_bitcast_bigger_aligned(<4 x i16> %a, <16 x
 define <4 x i16> @test_vadd_lane5_i16_bitcast_bigger_aligned(<4 x i16> %a, <16 x i8> %v) {
 ; CHECK-LABEL: test_vadd_lane5_i16_bitcast_bigger_aligned:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    dup v1.4h, v1.h[1]
+; CHECK-NEXT:    dup v1.4h, v1.h[5]
 ; CHECK-NEXT:    add v0.4h, v1.4h, v0.4h
 ; CHECK-NEXT:    ret
   %extract = shufflevector <16 x i8> %v, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -1728,6 +1726,8 @@ define <4 x i16> @test_vadd_lane5_i16_bitcast_bigger_aligned(<4 x i16> %a, <16 x
   ret <4 x i16> %r
 }
 
+; Negative test - can't dup bytes {3,4} of v8i16.
+
 define <4 x i16> @test_vadd_lane_i16_bitcast_bigger_unaligned(<4 x i16> %a, <16 x i8> %v) {
 ; CHECK-LABEL: test_vadd_lane_i16_bitcast_bigger_unaligned:
 ; CHECK:       // %bb.0:
-- 
2.7.4
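For readers who want to verify the lane arithmetic in getScaledOffsetDup by
hand, below is a minimal standalone C++ sketch of the same computation. It is
not part of the patch: the helper name scaleDupLane and the concrete values
are illustrative only, mirroring the worked examples in the code comments
above (Lane' = Lane + (ExtIdx * SrcEltBits) / CastEltBits).

#include <cassert>
#include <cstdio>

// Scale a DUP lane index across a bitcast of an extract_subvector. Valid only
// when the extract's bit offset is a multiple of the casted element width.
static bool scaleDupLane(unsigned ExtIdx, unsigned SrcEltBits,
                         unsigned CastEltBits, unsigned &Lane) {
  unsigned ExtIdxInBits = ExtIdx * SrcEltBits;
  if (ExtIdxInBits % CastEltBits != 0)
    return false; // e.g. bytes {3,4} do not form a single i16 lane
  Lane += ExtIdxInBits / CastEltBits;
  return true;
}

int main() {
  // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
  unsigned Lane = 1;
  bool Ok = scaleDupLane(/*ExtIdx=*/1, /*SrcEltBits=*/64, /*CastEltBits=*/32, Lane);
  assert(Ok && Lane == 3);

  // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
  Lane = 1;
  Ok = scaleDupLane(/*ExtIdx=*/8, /*SrcEltBits=*/8, /*CastEltBits=*/16, Lane);
  assert(Ok && Lane == 5);

  // Misaligned case, as in the negative test above: an extract starting at
  // byte 3 has no equivalent i16 lane, so the fold must be rejected.
  Lane = 0;
  Ok = scaleDupLane(/*ExtIdx=*/3, /*SrcEltBits=*/8, /*CastEltBits=*/16, Lane);
  assert(!Ok);

  std::printf("all lane-scaling checks passed\n");
  return 0;
}

The misaligned case is why test_vadd_lane_i16_bitcast_bigger_unaligned still
needs the ext instruction: its extract offset is not a multiple of the casted
element width, so no single DUPLANE index exists.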