unsigned SplatBitSize;
bool HasAnyUndefs;
if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+ if (SplatUndef.isAllOnesValue())
+ return DAG.getUNDEF(VT);
+
if (SplatBitSize <= 64) {
// Check if an immediate VMOV works.
EVT VmovVT;
return shuffle;
}
+ if (VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
+ // If we haven't found an efficient lowering, try splitting a 128-bit vector
+ // into two 64-bit vectors; we might discover a better way to lower it.
+ SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
+ EVT ExtVT = VT.getVectorElementType();
+ EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
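+ // Build each 64-bit half separately and recursively try to lower it; the
+ // recursion may find a VDUP/VMOV form that the full 128-bit vector could not.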
+ SDValue Lower =
+ DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2));
+ if (Lower.getOpcode() == ISD::BUILD_VECTOR)
+ Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
+ SDValue Upper = DAG.getBuildVector(
+ HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2));
+ if (Upper.getOpcode() == ISD::BUILD_VECTOR)
+ Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
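+ // Only use the split lowering if both halves lowered successfully; the
+ // recursive call returns a null SDValue when it cannot find a good lowering.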
+ if (Lower && Upper)
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
+ }
+
// Vectors with 32- or 64-bit elements can be built by directly assigning
// the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
// will be legalized.
; CHECK-LABEL: test_v2f64_f128:
define <2 x double> @test_v2f64_f128(fp128 %p) {
-; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0
-; CHECK: vmov.32 [[REG1]][1], r1
; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r2
+; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0
; CHECK: vmov.32 [[REG2]][1], r3
+; CHECK: vmov.32 [[REG1]][1], r1
%1 = fadd fp128 %p, %p
%2 = bitcast fp128 %1 to <2 x double>
%3 = fadd <2 x double> %2, %2
; CHECK-LABEL: test_v2i64_f128:
define <2 x i64> @test_v2i64_f128(fp128 %p) {
-; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0
-; CHECK: vmov.32 [[REG1]][1], r1
; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r2
+; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0
; CHECK: vmov.32 [[REG2]][1], r3
+; CHECK: vmov.32 [[REG1]][1], r1
%1 = fadd fp128 %p, %p
%2 = bitcast fp128 %1 to <2 x i64>
%3 = add <2 x i64> %2, %2
; CHECK-LABEL: test_v4f32_f128:
define <4 x float> @test_v4f32_f128(fp128 %p) {
-; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0
-; CHECK: vmov.32 [[REG1]][1], r1
; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r2
+; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0
; CHECK: vmov.32 [[REG2]][1], r3
+; CHECK: vmov.32 [[REG1]][1], r1
%1 = fadd fp128 %p, %p
%2 = bitcast fp128 %1 to <4 x float>
%3 = fadd <4 x float> %2, %2
; CHECK-LABEL: test_v4i32_f128:
define <4 x i32> @test_v4i32_f128(fp128 %p) {
-; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0
-; CHECK: vmov.32 [[REG1]][1], r1
; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r2
+; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0
; CHECK: vmov.32 [[REG2]][1], r3
+; CHECK: vmov.32 [[REG1]][1], r1
%1 = fadd fp128 %p, %p
%2 = bitcast fp128 %1 to <4 x i32>
%3 = add <4 x i32> %2, %2
; CHECK-LABEL: test_v8i16_f128:
define <8 x i16> @test_v8i16_f128(fp128 %p) {
-; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0
-; CHECK: vmov.32 [[REG1]][1], r1
; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r2
+; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0
; CHECK: vmov.32 [[REG2]][1], r3
+; CHECK: vmov.32 [[REG1]][1], r1
%1 = fadd fp128 %p, %p
%2 = bitcast fp128 %1 to <8 x i16>
%3 = add <8 x i16> %2, %2
; CHECK-LABEL: test_v16i8_f128:
define <16 x i8> @test_v16i8_f128(fp128 %p) {
-; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0
-; CHECK: vmov.32 [[REG1]][1], r1
; CHECK: vmov.32 [[REG2:d[0-9]+]][0], r2
+; CHECK: vmov.32 [[REG1:d[0-9]+]][0], r0
; CHECK: vmov.32 [[REG2]][1], r3
+; CHECK: vmov.32 [[REG1]][1], r1
%1 = fadd fp128 %p, %p
%2 = bitcast fp128 %1 to <16 x i8>
%3 = add <16 x i8> %2, %2
%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <8 x i8> %tmp2
}
+
+; vcombine(vld1_dup(p), vld1_dup(p2))
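+; Roughly the intrinsic-level pattern modelled by the IR below (illustrative
+; sketch only, assuming arm_neon.h):
+;   int16x8_t vcombine_vdup(int16x8_t src, const int16_t *p) {
+;     return vcombine_s16(vld1_dup_s16(p), vld1_dup_s16(p + 1));
+;   }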
+define <8 x i16> @vcombine_vdup(<8 x i16> %src, i16* nocapture readonly %p) {
+; CHECK-LABEL: vcombine_vdup:
+; CHECK: vld1.16 {d16[]},
+; CHECK: vld1.16 {d17[]},
+; CHECK-LE: vmov r0, r1, d16
+; CHECK-LE: vmov r2, r3, d17
+ %a1 = load i16, i16* %p, align 2
+ %a2 = insertelement <4 x i16> undef, i16 %a1, i32 0
+ %a3 = shufflevector <4 x i16> %a2, <4 x i16> undef, <4 x i32> zeroinitializer
+ %p2 = getelementptr inbounds i16, i16* %p, i32 1
+ %b1 = load i16, i16* %p2, align 2
+ %b2 = insertelement <4 x i16> undef, i16 %b1, i32 0
+ %b3 = shufflevector <4 x i16> %b2, <4 x i16> undef, <4 x i32> zeroinitializer
+ %shuffle = shufflevector <4 x i16> %a3, <4 x i16> %b3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %shuffle
+}
; The actual shuffle code only handles some cases, make sure we check
; this rather than blindly emitting a VECTOR_SHUFFLE (infinite
; lowering loop can result otherwise).
+; (There are probably better ways to lower this shuffle, but it's not
+; really important.)
define <8 x i16> @test_illegal(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: test_illegal:
-;CHECK: vmov.16 [[REG:d[0-9]+]][0]
-;CHECK: vmov.16 [[REG]][1]
-;CHECK: vmov.16 [[REG]][2]
-;CHECK: vmov.16 [[REG]][3]
-;CHECK: vmov.16 [[REG2:d[0-9]+]][0]
-;CHECK: vmov.16 [[REG2]][1]
-;CHECK: vmov.16 [[REG2]][2]
-;CHECK: vmov.16 [[REG2]][3]
+;CHECK: vmov.u16
+;CHECK-NEXT: vmov.u16
+;CHECK-NEXT: vorr
+;CHECK-NEXT: vorr
+;CHECK-NEXT: vmov.16
+;CHECK-NEXT: vuzp.16
+;CHECK-NEXT: vmov.u16
+;CHECK-NEXT: vmov.16
+;CHECK-NEXT: vuzp.16
+;CHECK-NEXT: vmov.16
+;CHECK-NEXT: vmov.u16
+;CHECK-NEXT: vext.16
+;CHECK-NEXT: vmov.16
+;CHECK-NEXT: vmov r0, r1, d
+;CHECK-NEXT: vmov r2, r3, d
%tmp1 = load <8 x i16>, <8 x i16>* %A
%tmp2 = load <8 x i16>, <8 x i16>* %B
%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 7, i32 5, i32 13, i32 3, i32 2, i32 2, i32 9>
ret <8 x i16> %tmp3
}
-; Negative test that should not generate a vtrn
+; The shuffle mask is half a vtrn; we duplicate the half to produce the
+; full result.
define void @lower_twice_no_vtrn(<4 x i16>* %A, <4 x i16>* %B, <8 x i16>* %C) {
entry:
; CHECK-LABEL: lower_twice_no_vtrn
; CHECK: @ BB#0:
- ; CHECK-NOT: vtrn
- ; CHECK: mov pc, lr
+ ; CHECK-NEXT: vldr d16, [r1]
+ ; CHECK-NEXT: vldr d18, [r0]
+ ; CHECK-NEXT: vtrn.16 d18, d16
+ ; CHECK-NEXT: vorr d17, d16, d16
+ ; CHECK-NEXT: vst1.64 {d16, d17}, [r2]
+ ; CHECK-NEXT: mov pc, lr
%tmp1 = load <4 x i16>, <4 x i16>* %A
%tmp2 = load <4 x i16>, <4 x i16>* %B
%0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 5, i32 3, i32 7, i32 1, i32 5, i32 3, i32 7>
store <8 x i16> %0, <8 x i16>* %C
ret void
}
-; Negative test that should not generate a vtrn
+; The shuffle mask is half a vtrn; we duplicate the half to produce the
+; full result.
define void @upper_twice_no_vtrn(<4 x i16>* %A, <4 x i16>* %B, <8 x i16>* %C) {
entry:
; CHECK-LABEL: upper_twice_no_vtrn
; CHECK: @ BB#0:
- ; CHECK-NOT: vtrn
- ; CHECK: mov pc, lr
+ ; CHECK-NEXT: vldr d16, [r1]
+ ; CHECK-NEXT: vldr d18, [r0]
+ ; CHECK-NEXT: vtrn.16 d18, d16
+ ; CHECK-NEXT: vorr d19, d18, d18
+ ; CHECK-NEXT: vst1.64 {d18, d19}, [r2]
+ ; CHECK-NEXT: mov pc, lr
%tmp1 = load <4 x i16>, <4 x i16>* %A
%tmp2 = load <4 x i16>, <4 x i16>* %B
%0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 0, i32 undef, i32 2, i32 6, i32 0, i32 4, i32 2, i32 6>