From 058cd8c5be1cb6c74c0cfc154a61ea859e1c8811 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Fri, 24 Apr 2020 07:55:53 +0100
Subject: [PATCH] [CodeGen] Add support for inserting elements into scalable
 vectors

Summary:
This patch tries to ensure that we do something sensible when
generating code for the ISD::INSERT_VECTOR_ELT DAG node when operating
on scalable vectors. Previously we always returned 'undef' when
inserting an element into an out-of-bounds lane index, whereas now we
only do this for fixed length vectors. For scalable vectors it is
assumed that the backend will do the right thing in the same way that
we have to deal with variable lane indices.

In this patch I have permitted a few basic combinations for scalable
vector types where it makes sense, but in general avoided most cases
for now as they currently require the use of BUILD_VECTOR nodes.

This patch includes tests for all scalable vector types when inserting
into lane 0, but I've only included one or two vector types for other
cases such as variable lane inserts.

Differential Revision: https://reviews.llvm.org/D78992
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp   |  17 ++-
 llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp  |   7 +-
 llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td  |  71 +++++++++++++
 llvm/test/CodeGen/AArch64/sve-insert-element.ll | 135 ++++++++++++++++++++++++
 4 files changed, 224 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sve-insert-element.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 1a8242a..9042f51 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17134,11 +17134,11 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
   SDLoc DL(N);
   EVT VT = InVec.getValueType();
-  unsigned NumElts = VT.getVectorNumElements();
   auto *IndexC = dyn_cast<ConstantSDNode>(EltNo);

   // Insert into out-of-bounds element is undefined.
-  if (IndexC && IndexC->getZExtValue() >= VT.getVectorNumElements())
+  if (IndexC && VT.isFixedLengthVector() &&
+      IndexC->getZExtValue() >= VT.getVectorNumElements())
     return DAG.getUNDEF(VT);

   // Remove redundant insertions:
@@ -17151,12 +17151,21 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
     // If this is variable insert to undef vector, it might be better to splat:
     // inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... >
     if (InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT)) {
-      SmallVector<SDValue, 8> Ops(NumElts, InVal);
-      return DAG.getBuildVector(VT, DL, Ops);
+      if (VT.isScalableVector())
+        return DAG.getSplatVector(VT, DL, InVal);
+      else {
+        SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), InVal);
+        return DAG.getBuildVector(VT, DL, Ops);
+      }
     }
     return SDValue();
   }

+  if (VT.isScalableVector())
+    return SDValue();
+
+  unsigned NumElts = VT.getVectorNumElements();
+
   // We must know which element is being inserted for folds below here.
   unsigned Elt = IndexC->getZExtValue();
   if (SDValue Shuf = combineInsertEltToShuffle(N, Elt))
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 4d89bee..7929682 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5615,8 +5615,11 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     llvm_unreachable("should use getVectorShuffle constructor!");
   case ISD::INSERT_VECTOR_ELT: {
     ConstantSDNode *N3C = dyn_cast<ConstantSDNode>(N3);
-    // INSERT_VECTOR_ELT into out-of-bounds element is an UNDEF
-    if (N3C && N3C->getZExtValue() >= N1.getValueType().getVectorNumElements())
+    // INSERT_VECTOR_ELT into out-of-bounds element is an UNDEF, except
+    // for scalable vectors where we will generate appropriate code to
+    // deal with out-of-bounds cases correctly.
+ if (N3C && N1.getValueType().isFixedLengthVector() && + N3C->getZExtValue() >= N1.getValueType().getVectorNumElements()) return getUNDEF(VT); // Undefined index can be assumed out-of-bounds, so that's UNDEF too. diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 7864cc1f..ed79063 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1767,6 +1767,77 @@ multiclass sve_prefetch; + def : Pat<(nxv16i8 (vector_insert (nxv16i8 (undef)), (i32 FPR32:$src), 0)), + (INSERT_SUBREG (nxv16i8 (IMPLICIT_DEF)), FPR32:$src, ssub)>; + def : Pat<(nxv8i16 (vector_insert (nxv8i16 (undef)), (i32 FPR32:$src), 0)), + (INSERT_SUBREG (nxv8i16 (IMPLICIT_DEF)), FPR32:$src, ssub)>; + def : Pat<(nxv4i32 (vector_insert (nxv4i32 (undef)), (i32 FPR32:$src), 0)), + (INSERT_SUBREG (nxv4i32 (IMPLICIT_DEF)), FPR32:$src, ssub)>; + def : Pat<(nxv2i64 (vector_insert (nxv2i64 (undef)), (i64 FPR64:$src), 0)), + (INSERT_SUBREG (nxv2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>; + + // Insert scalar into vector[0] + def : Pat<(nxv16i8 (vector_insert (nxv16i8 ZPR:$vec), (i32 GPR32:$src), 0)), + (CPY_ZPmR_B ZPR:$vec, (PTRUE_B 1), GPR32:$src)>; + def : Pat<(nxv8i16 (vector_insert (nxv8i16 ZPR:$vec), (i32 GPR32:$src), 0)), + (CPY_ZPmR_H ZPR:$vec, (PTRUE_H 1), GPR32:$src)>; + def : Pat<(nxv4i32 (vector_insert (nxv4i32 ZPR:$vec), (i32 GPR32:$src), 0)), + (CPY_ZPmR_S ZPR:$vec, (PTRUE_S 1), GPR32:$src)>; + def : Pat<(nxv2i64 (vector_insert (nxv2i64 ZPR:$vec), (i64 GPR64:$src), 0)), + (CPY_ZPmR_D ZPR:$vec, (PTRUE_D 1), GPR64:$src)>; + + def : Pat<(nxv8f16 (vector_insert (nxv8f16 ZPR:$vec), (f16 FPR16:$src), 0)), + (SEL_ZPZZ_H (PTRUE_H 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), ZPR:$vec)>; + def : Pat<(nxv4f32 (vector_insert (nxv4f32 ZPR:$vec), (f32 FPR32:$src), 0)), + (SEL_ZPZZ_S (PTRUE_S 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), ZPR:$vec)>; + def : Pat<(nxv2f64 
(vector_insert (nxv2f64 ZPR:$vec), (f64 FPR64:$src), 0)), + (SEL_ZPZZ_D (PTRUE_D 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$src, dsub), ZPR:$vec)>; + + // Insert scalar into vector with scalar index + def : Pat<(nxv16i8 (vector_insert (nxv16i8 ZPR:$vec), GPR32:$src, GPR64:$index)), + (CPY_ZPmR_B ZPR:$vec, + (CMPEQ_PPzZZ_B (PTRUE_B 31), + (INDEX_II_B 0, 1), + (DUP_ZR_B (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))), + GPR32:$src)>; + def : Pat<(nxv8i16 (vector_insert (nxv8i16 ZPR:$vec), GPR32:$src, GPR64:$index)), + (CPY_ZPmR_H ZPR:$vec, + (CMPEQ_PPzZZ_H (PTRUE_H 31), + (INDEX_II_H 0, 1), + (DUP_ZR_H (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))), + GPR32:$src)>; + def : Pat<(nxv4i32 (vector_insert (nxv4i32 ZPR:$vec), GPR32:$src, GPR64:$index)), + (CPY_ZPmR_S ZPR:$vec, + (CMPEQ_PPzZZ_S (PTRUE_S 31), + (INDEX_II_S 0, 1), + (DUP_ZR_S (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))), + GPR32:$src)>; + def : Pat<(nxv2i64 (vector_insert (nxv2i64 ZPR:$vec), GPR64:$src, GPR64:$index)), + (CPY_ZPmR_D ZPR:$vec, + (CMPEQ_PPzZZ_D (PTRUE_D 31), + (INDEX_II_D 0, 1), + (DUP_ZR_D GPR64:$index)), + GPR64:$src)>; + + // Insert FP scalar into vector with scalar index + def : Pat<(nxv8f16 (vector_insert (nxv8f16 ZPR:$vec), (f16 FPR16:$src), GPR64:$index)), + (CPY_ZPmV_H ZPR:$vec, + (CMPEQ_PPzZZ_H (PTRUE_H 31), + (INDEX_II_H 0, 1), + (DUP_ZR_H (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))), + $src)>; + def : Pat<(nxv4f32 (vector_insert (nxv4f32 ZPR:$vec), (f32 FPR32:$src), GPR64:$index)), + (CPY_ZPmV_S ZPR:$vec, + (CMPEQ_PPzZZ_S (PTRUE_S 31), + (INDEX_II_S 0, 1), + (DUP_ZR_S (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))), + $src)>; + def : Pat<(nxv2f64 (vector_insert (nxv2f64 ZPR:$vec), (f64 FPR64:$src), GPR64:$index)), + (CPY_ZPmV_D ZPR:$vec, + (CMPEQ_PPzZZ_D (PTRUE_D 31), + (INDEX_II_D 0, 1), + (DUP_ZR_D $index)), + $src)>; } let Predicates = [HasSVE, HasMatMulInt8] in { diff --git a/llvm/test/CodeGen/AArch64/sve-insert-element.ll b/llvm/test/CodeGen/AArch64/sve-insert-element.ll new file 
mode 100644
index 0000000..90acf8c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-insert-element.ll
@@ -0,0 +1,135 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define <vscale x 16 x i8> @test_lane0_16xi8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: test_lane0_16xi8
+; CHECK: mov [[REG:.*]], #30
+; CHECK: mov z0.b, p{{[0-7]}}/m, [[REG]]
+  %b = insertelement <vscale x 16 x i8> %a, i8 30, i32 0
+  ret <vscale x 16 x i8> %b
+}
+
+define <vscale x 8 x i16> @test_lane0_8xi16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: test_lane0_8xi16
+; CHECK: mov [[REG:.*]], #30
+; CHECK: mov z0.h, p{{[0-7]}}/m, [[REG]]
+  %b = insertelement <vscale x 8 x i16> %a, i16 30, i32 0
+  ret <vscale x 8 x i16> %b
+}
+
+define <vscale x 4 x i32> @test_lane0_4xi32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: test_lane0_4xi32
+; CHECK: mov [[REG:.*]], #30
+; CHECK: mov z0.s, p{{[0-7]}}/m, [[REG]]
+  %b = insertelement <vscale x 4 x i32> %a, i32 30, i32 0
+  ret <vscale x 4 x i32> %b
+}
+
+define <vscale x 2 x i64> @test_lane0_2xi64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: test_lane0_2xi64
+; CHECK: mov w[[REG:.*]], #30
+; CHECK: mov z0.d, p{{[0-7]}}/m, x[[REG]]
+  %b = insertelement <vscale x 2 x i64> %a, i64 30, i32 0
+  ret <vscale x 2 x i64> %b
+}
+
+define <vscale x 2 x double> @test_lane0_2xf64(<vscale x 2 x double> %a) {
+; CHECK-LABEL: test_lane0_2xf64
+; CHECK: fmov d[[REG:[0-9]+]], #1.00000000
+; CHECK: mov z0.d, p{{[0-7]}}/m, z[[REG]].d
+  %b = insertelement <vscale x 2 x double> %a, double 1.0, i32 0
+  ret <vscale x 2 x double> %b
+}
+
+define <vscale x 4 x float> @test_lane0_4xf32(<vscale x 4 x float> %a) {
+; CHECK-LABEL: test_lane0_4xf32
+; CHECK: fmov s[[REG:[0-9]+]], #1.00000000
+; CHECK: mov z0.s, p{{[0-7]}}/m, z[[REG]].s
+  %b = insertelement <vscale x 4 x float> %a, float 1.0, i32 0
+  ret <vscale x 4 x float> %b
+}
+
+define <vscale x 8 x half> @test_lane0_8xf16(<vscale x 8 x half> %a) {
+; CHECK-LABEL: test_lane0_8xf16
+; CHECK: fmov h[[REG:[0-9]+]], #1.00000000
+; CHECK: mov z0.h, p{{[0-7]}}/m, z[[REG]].h
+  %b = insertelement <vscale x 8 x half> %a, half 1.0, i32 0
+  ret <vscale x 8 x half> %b
+}
+
+; Undefined lane insert
+define <vscale x 2 x i64> @test_lane4_2xi64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: test_lane4_2xi64
+; CHECK: mov w[[IDXREG:.*]], #4
+; CHECK: index z[[CMPVEC:[0-9]+]].d, #0, #1
+; CHECK: mov z[[IDXVEC:[0-9]+]].d, x[[IDXREG]]
+; CHECK: cmpeq p[[PRED:[0-9]+]].d, p{{[0-7]}}/z, z[[CMPVEC]].d, z[[IDXVEC]].d
+; CHECK: mov w[[VALREG:.*]], #30
+; CHECK: mov z0.d, p[[PRED]]/m, x[[VALREG]]
+  %b = insertelement <vscale x 2 x i64> %a, i64 30, i32 4
+  ret <vscale x 2 x i64> %b
+}
+
+; Undefined lane insert
+define <vscale x 8 x half> @test_lane9_8xf16(<vscale x 8 x half> %a) {
+; CHECK-LABEL: test_lane9_8xf16
+; CHECK: mov w[[IDXREG:.*]], #9
+; CHECK: index z[[CMPVEC:[0-9]+]].h, #0, #1
+; CHECK: mov z[[IDXVEC:[0-9]+]].h, w[[IDXREG]]
+; CHECK: cmpeq p[[PRED:[0-9]+]].h, p{{[0-7]}}/z, z[[CMPVEC]].h, z[[IDXVEC]].h
+; CHECK: fmov h[[VALREG:[0-9]+]], #1.00000000
+; CHECK: mov z0.h, p[[PRED]]/m, h[[VALREG]]
+  %b = insertelement <vscale x 8 x half> %a, half 1.0, i32 9
+  ret <vscale x 8 x half> %b
+}
+
+define <vscale x 16 x i8> @test_lane1_16xi8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: test_lane1_16xi8
+; CHECK: mov w[[IDXREG:.*]], #1
+; CHECK: index z[[CMPVEC:[0-9]+]].b, #0, #1
+; CHECK: mov z[[IDXVEC:[0-9]+]].b, w[[IDXREG]]
+; CHECK: cmpeq p[[PRED:[0-9]+]].b, p{{[0-7]}}/z, z[[CMPVEC]].b, z[[IDXVEC]].b
+; CHECK: mov w[[VALREG:.*]], #30
+; CHECK: mov z0.b, p[[PRED]]/m, w[[VALREG]]
+  %b = insertelement <vscale x 16 x i8> %a, i8 30, i32 1
+  ret <vscale x 16 x i8> %b
+}
+
+define <vscale x 16 x i8> @test_lanex_16xi8(<vscale x 16 x i8> %a, i32 %x) {
+; CHECK-LABEL: test_lanex_16xi8
+; CHECK: index z[[CMPVEC:[0-9]+]].b, #0, #1
+; CHECK: mov z[[IDXVEC:[0-9]+]].b, w[[IDXREG]]
+; CHECK: cmpeq p[[PRED:[0-9]+]].b, p{{[0-7]}}/z, z[[CMPVEC]].b, z[[IDXVEC]].b
+; CHECK: mov w[[VALREG:.*]], #30
+; CHECK: mov z0.b, p[[PRED]]/m, w[[VALREG]]
+  %b = insertelement <vscale x 16 x i8> %a, i8 30, i32 %x
+  ret <vscale x 16 x i8> %b
+}
+
+
+; Redundant lane insert
+define <vscale x 4 x i32> @extract_insert_4xi32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: extract_insert_4xi32
+; CHECK-NOT: mov w{{.*}}, #30
+; CHECK-NOT: mov z0.d
+  %b = extractelement <vscale x 4 x i32> %a, i32 2
+  %c = insertelement <vscale x 4 x i32> %a, i32 %b, i32 2
+  ret <vscale x 4 x i32> %c
+}
+
+define <vscale x 8 x i16> @test_lane6_undef_8xi16(i16 %a) {
+; CHECK-LABEL: test_lane6_undef_8xi16
+; CHECK: mov w[[IDXREG:.*]], #6
+; CHECK: index z[[CMPVEC:.*]].h, #0, #1
+; CHECK: mov z[[IDXVEC:[0-9]+]].h, w[[IDXREG]]
+; CHECK: cmpeq p[[PRED:.*]].h, p{{.*}}/z, z[[CMPVEC]].h, z[[IDXVEC]].h
+; CHECK: mov z0.h, p[[PRED]]/m, w0
+  %b = insertelement <vscale x 8 x i16> undef, i16 %a, i32 6
+  ret <vscale x 8 x i16> %b
+}
+
+define <vscale x 16 x i8> @test_lane0_undef_16xi8(i8 %a) {
+; CHECK-LABEL: test_lane0_undef_16xi8
+; CHECK: fmov s0, w0
+  %b = insertelement <vscale x 16 x i8> undef, i8 %a, i32 0
+  ret <vscale x 16 x i8> %b
+}
-- 
2.7.4