From 8d70e6425c7b782729ed9461d11928ca0f045fd2 Mon Sep 17 00:00:00 2001
From: Amara Emerson <aemerson@apple.com>
Date: Thu, 28 Feb 2019 18:47:29 +0000
Subject: [PATCH] Revert "[AArch64][GlobalISel] Add support for 64 bit vector
 shuffle using TBL1."

This reverts the change because it seems to break some NEON intrinsics tests.

llvm-svn: 355115
---
 .../Target/AArch64/AArch64InstructionSelector.cpp  | 144 ++++-----------------
 .../AArch64/GlobalISel/select-shuffle-vector.mir   |  72 ++++-------
 2 files changed, 51 insertions(+), 165 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
index ebc2d9c..83d61c8 100644
--- a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -82,8 +82,6 @@ private:
   unsigned emitConstantPoolEntry(Constant *CPVal, MachineFunction &MF) const;
   MachineInstr *emitLoadFromConstantPool(Constant *CPVal,
                                          MachineIRBuilder &MIRBuilder) const;
-  MachineInstr *emitVectorConcat(unsigned Op1, unsigned Op2,
-                                 MachineIRBuilder &MIRBuilder) const;
 
   ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
 
@@ -1967,98 +1965,6 @@ MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
   return &*Load;
 }
 
-/// Return an <Opcode, SubregIndex> pair to do an vector elt insert of a given
-/// size and RB.
-static std::pair<unsigned, unsigned>
-getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
-  unsigned Opc, SubregIdx;
-  if (RB.getID() == AArch64::GPRRegBankID) {
-    if (EltSize == 32) {
-      Opc = AArch64::INSvi32gpr;
-      SubregIdx = AArch64::ssub;
-    } else if (EltSize == 64) {
-      Opc = AArch64::INSvi64gpr;
-      SubregIdx = AArch64::dsub;
-    } else {
-      llvm_unreachable("invalid elt size!");
-    }
-  } else {
-    if (EltSize == 8) {
-      Opc = AArch64::INSvi8lane;
-      SubregIdx = AArch64::bsub;
-    } else if (EltSize == 16) {
-      Opc = AArch64::INSvi16lane;
-      SubregIdx = AArch64::hsub;
-    } else if (EltSize == 32) {
-      Opc = AArch64::INSvi32lane;
-      SubregIdx = AArch64::ssub;
-    } else if (EltSize == 64) {
-      Opc = AArch64::INSvi64lane;
-      SubregIdx = AArch64::dsub;
-    } else {
-      llvm_unreachable("invalid elt size!");
-    }
-  }
-  return std::make_pair(Opc, SubregIdx);
-}
-
-MachineInstr *AArch64InstructionSelector::emitVectorConcat(
-    unsigned Op1, unsigned Op2, MachineIRBuilder &MIRBuilder) const {
-  // We implement a vector concat by:
-  // 1. Use scalar_to_vector to insert the lower vector into the larger dest
-  // 2. Insert the upper vector into the destination's upper element
-  // TODO: some of this code is common with G_BUILD_VECTOR handling.
-  MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
-
-  const LLT Op1Ty = MRI.getType(Op1);
-  const LLT Op2Ty = MRI.getType(Op2);
-
-  if (Op1Ty != Op2Ty) {
-    LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
-    return nullptr;
-  }
-  assert(Op1Ty.isVector() && "Expected a vector for vector concat");
-
-  if (Op1Ty.getSizeInBits() >= 128) {
-    LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
-    return nullptr;
-  }
-
-  // At the moment we just support 64 bit vector concats.
-  if (Op1Ty.getSizeInBits() != 64) {
-    LLVM_DEBUG(dbgs() << "Vector concat supported for 64b vectors");
-    return nullptr;
-  }
-
-  const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits());
-  const LLT &DstTy = LLT::vector(2, ScalarTy);
-  const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI);
-  const TargetRegisterClass *DstRC =
-      getMinClassForRegBank(FPRBank, Op1Ty.getSizeInBits() * 2);
-
-  MachineInstr *WidenedOp1 = emitScalarToVector(DstTy, DstRC, Op1, MIRBuilder);
-  MachineInstr *WidenedOp2 = emitScalarToVector(DstTy, DstRC, Op2, MIRBuilder);
-  if (!WidenedOp1 || !WidenedOp2) {
-    LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
-    return nullptr;
-  }
-
-  // Now do the insert of the upper element.
-  unsigned InsertOpc, InsSubRegIdx;
-  std::tie(InsertOpc, InsSubRegIdx) =
-      getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits());
-
-  auto InsElt =
-      MIRBuilder
-          .buildInstr(InsertOpc, {DstRC}, {WidenedOp1->getOperand(0).getReg()})
-          .addImm(1) /* Lane index */
-          .addUse(WidenedOp2->getOperand(0).getReg())
-          .addImm(0);
-
-  constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
-  return &*InsElt;
-}
-
 bool AArch64InstructionSelector::selectShuffleVector(
     MachineInstr &I, MachineRegisterInfo &MRI) const {
   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
@@ -2096,37 +2002,21 @@ bool AArch64InstructionSelector::selectShuffleVector(
     }
   }
 
-  MachineIRBuilder MIRBuilder(I);
+  if (DstTy.getSizeInBits() != 128) {
+    assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
+    // TODO: This case can be done with TBL1; not yet implemented.
+    return false;
+  }
 
   // Use a constant pool to load the index vector for TBL.
   Constant *CPVal = ConstantVector::get(CstIdxs);
+  MachineIRBuilder MIRBuilder(I);
   MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIRBuilder);
   if (!IndexLoad) {
     LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
     return false;
   }
 
-  if (DstTy.getSizeInBits() != 128) {
-    assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
-    // This case can be done with TBL1.
-    MachineInstr *Concat = emitVectorConcat(Src1Reg, Src2Reg, MIRBuilder);
-    if (!Concat) {
-      LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
-      return false;
-    }
-    auto TBL1 = MIRBuilder.buildInstr(
-        AArch64::TBLv16i8One, {&AArch64::FPR128RegClass},
-        {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()});
-    constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI);
-
-    auto Copy = BuildMI(*I.getParent(), I, I.getDebugLoc(),
-                        TII.get(TargetOpcode::COPY), I.getOperand(0).getReg())
-                    .addUse(TBL1->getOperand(0).getReg(), 0, AArch64::dsub);
-    RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI);
-    I.eraseFromParent();
-    return true;
-  }
-
   // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
   // Q registers for regalloc.
   auto RegSeq = MIRBuilder
@@ -2158,8 +2048,26 @@ bool AArch64InstructionSelector::selectBuildVector(
   const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
   unsigned Opc;
   unsigned SubregIdx;
-
-  std::tie(Opc, SubregIdx) = getInsertVecEltOpInfo(RB, EltSize);
+  if (RB.getID() == AArch64::GPRRegBankID) {
+    if (EltSize == 32) {
+      Opc = AArch64::INSvi32gpr;
+      SubregIdx = AArch64::ssub;
+    } else {
+      Opc = AArch64::INSvi64gpr;
+      SubregIdx = AArch64::dsub;
+    }
+  } else {
+    if (EltSize == 16) {
+      Opc = AArch64::INSvi16lane;
+      SubregIdx = AArch64::hsub;
+    } else if (EltSize == 32) {
+      Opc = AArch64::INSvi32lane;
+      SubregIdx = AArch64::ssub;
+    } else {
+      Opc = AArch64::INSvi64lane;
+      SubregIdx = AArch64::dsub;
+    }
+  }
 
   MachineIRBuilder MIRBuilder(I);
 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-shuffle-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-shuffle-vector.mir
index f03e480..b78c7a5 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/select-shuffle-vector.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-shuffle-vector.mir
@@ -1,17 +1,11 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# WARNING: update_mir_test_checks.py does not include the constant pools output,
-# so this test requires manual fixing up after running the script.
-
 # RUN: llc -mtriple=aarch64-- -O0 -run-pass=instruction-select -verify-machineinstrs %s -global-isel-abort=1 -o - | FileCheck %s
 --- |
+  ; ModuleID = 'shufflevec-only-legal.ll'
+  source_filename = "shufflevec-only-legal.ll"
   target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
   target triple = "aarch64"
 
-  define <2 x float> @shuffle_v2f32(<2 x float> %a, <2 x float> %b) {
-    %shuf = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 0>
-    ret <2 x float> %shuf
-  }
-
   define <4 x i32> @shuffle_v4i32(<4 x i32> %a, <4 x i32> %b) {
     %shuf = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 3, i32 0>
     ret <4 x i32> %shuf
@@ -29,50 +23,19 @@
 
 ...
 ---
-name:            shuffle_v2f32
-alignment:       2
-legalized:       true
-regBankSelected: true
-tracksRegLiveness: true
-body:             |
-  bb.1 (%ir-block.0):
-    liveins: $d0, $d1
-
-    ; CHECK-LABEL: name: shuffle_v2f32
-    ; CHECK: constants:
-    ; CHECK:  - id:              0
-    ; CHECK:    value:           '<8 x i8> <i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3>'
-    ; CHECK:    alignment:       8
-    ; CHECK: liveins: $d0, $d1
-    ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
-    ; CHECK: [[COPY1:%[0-9]+]]:fpr64 = COPY $d1
-    ; CHECK: [[ADRP:%[0-9]+]]:gpr64common = ADRP target-flags(aarch64-page) %const.0
-    ; CHECK: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui [[ADRP]], target-flags(aarch64-pageoff, aarch64-nc) %const.0
-    ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-    ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.dsub
-    ; CHECK: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-    ; CHECK: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], [[COPY1]], %subreg.dsub
-    ; CHECK: [[INSvi64lane:%[0-9]+]]:fpr128 = INSvi64lane [[INSERT_SUBREG]], 1, [[INSERT_SUBREG1]], 0
-    ; CHECK: [[TBLv16i8One:%[0-9]+]]:fpr128 = TBLv16i8One [[INSvi64lane]], [[LDRQui]]
-    ; CHECK: [[COPY2:%[0-9]+]]:fpr64 = COPY [[TBLv16i8One]].dsub
-    ; CHECK: $d0 = COPY [[COPY2]]
-    ; CHECK: RET_ReallyLR implicit $d0
-    %0:fpr(<2 x s32>) = COPY $d0
-    %1:fpr(<2 x s32>) = COPY $d1
-    %4:gpr(s32) = G_CONSTANT i32 1
-    %5:gpr(s32) = G_CONSTANT i32 0
-    %3:fpr(<2 x s32>) = G_BUILD_VECTOR %4(s32), %5(s32)
-    %2:fpr(<2 x s32>) = G_SHUFFLE_VECTOR %0(<2 x s32>), %1, %3(<2 x s32>)
-    $d0 = COPY %2(<2 x s32>)
-    RET_ReallyLR implicit $d0
-
-...
----
 name:            shuffle_v4i32
 alignment:       2
 legalized:       true
 regBankSelected: true
 tracksRegLiveness: true
+registers:
+  - { id: 0, class: fpr }
+  - { id: 1, class: fpr }
+  - { id: 2, class: fpr }
+  - { id: 3, class: fpr }
+  - { id: 4, class: gpr }
+  - { id: 5, class: gpr }
+  - { id: 6, class: gpr }
 body:             |
   bb.1 (%ir-block.0):
     liveins: $q0, $q1
@@ -108,6 +71,15 @@ alignment:       2
 legalized:       true
 regBankSelected: true
 tracksRegLiveness: true
+registers:
+  - { id: 0, class: fpr }
+  - { id: 1, class: fpr }
+  - { id: 2, class: fpr }
+  - { id: 3, class: fpr }
+  - { id: 4, class: gpr }
+  - { id: 5, class: gpr }
+  - { id: 6, class: gpr }
+  - { id: 7, class: gpr }
 body:             |
   bb.1 (%ir-block.0):
     liveins: $q0, $q1
@@ -144,6 +116,12 @@ alignment:       2
 legalized:       true
 regBankSelected: true
 tracksRegLiveness: true
+registers:
+  - { id: 0, class: fpr }
+  - { id: 1, class: fpr }
+  - { id: 2, class: fpr }
+  - { id: 3, class: fpr }
+  - { id: 4, class: gpr }
 body:             |
   bb.1 (%ir-block.0):
     liveins: $q0, $q1
-- 
2.7.4