[AArch64] Handle vector types in replaceZeroVectorStore.

author Geoff Berry <gberry@codeaurora.org>

Wed, 16 Nov 2016 19:35:19 +0000 (19:35 +0000)

committer Geoff Berry <gberry@codeaurora.org>

Wed, 16 Nov 2016 19:35:19 +0000 (19:35 +0000)
author Geoff Berry <gberry@codeaurora.org>
Wed, 16 Nov 2016 19:35:19 +0000 (19:35 +0000)
committer Geoff Berry <gberry@codeaurora.org>
Wed, 16 Nov 2016 19:35:19 +0000 (19:35 +0000)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

index b8d6795..019d72d 100644 (file)
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8844,13 +8844,10 @@ static SDValue performExtendCombine(SDNode *N,
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
  }
  
-static SDValue split16BStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
-                                  SDValue SplatVal, unsigned NumVecElts) {
-  assert((NumVecElts == 4 || NumVecElts == 2) && "Unexpected NumVecElts");
-
+static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
+                               SDValue SplatVal, unsigned NumVecElts) {
    unsigned OrigAlignment = St.getAlignment();
-  unsigned EltOffset = NumVecElts == 4 ? 4 : 8;
-  unsigned Alignment = std::min(OrigAlignment, EltOffset);
+  unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
  
    // Create scalar stores. This is at least as good as the code sequence for a
    // split unaligned store which is a dup.s, ext.b, and two stores.
@@ -8860,10 +8857,11 @@ static SDValue split16BStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
    SDValue BasePtr = St.getBasePtr();
    SDValue NewST1 =
        DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, St.getPointerInfo(),
-                   St.getAlignment(), St.getMemOperand()->getFlags());
+                   OrigAlignment, St.getMemOperand()->getFlags());
  
    unsigned Offset = EltOffset;
    while (--NumVecElts) {
+    unsigned Alignment = MinAlign(OrigAlignment, Offset);
      SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
                                      DAG.getConstant(Offset, DL, MVT::i64));
      NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
@@ -8893,9 +8891,13 @@ static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
    SDValue StVal = St.getValue();
    EVT VT = StVal.getValueType();
  
-  // We can express a splat as store pair(s) for 2 or 4 elements.
+  // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
+  // 2, 3 or 4 i32 elements.
    int NumVecElts = VT.getVectorNumElements();
-  if (NumVecElts != 4 && NumVecElts != 2)
+  if (!(((NumVecElts == 2 || NumVecElts == 3) &&
+         VT.getVectorElementType().getSizeInBits() == 64) ||
+        ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
+         VT.getVectorElementType().getSizeInBits() == 32)))
      return SDValue();
  
    if (StVal.getOpcode() != ISD::BUILD_VECTOR)
@@ -8917,16 +8919,16 @@ static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
  
    for (int I = 0; I < NumVecElts; ++I) {
      SDValue EltVal = StVal.getOperand(I);
-    if (!isa<ConstantSDNode>(EltVal) ||
-        !cast<ConstantSDNode>(EltVal)->isNullValue())
+    if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
        return SDValue();
    }
+
    // Use WZR/XZR here to prevent DAGCombiner::MergeConsecutiveStores from
    // undoing this transformation.
-  return split16BStoreSplat(
-      DAG, St, NumVecElts == 4 ? DAG.getRegister(AArch64::WZR, MVT::i32)
-                               : DAG.getRegister(AArch64::XZR, MVT::i64),
-      NumVecElts);
+  SDValue SplatVal = VT.getVectorElementType().getSizeInBits() == 32
+                         ? DAG.getRegister(AArch64::WZR, MVT::i32)
+                         : DAG.getRegister(AArch64::XZR, MVT::i64);
+  return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
  }
  
  /// Replace a splat of a scalar to a vector store by scalar stores of the scalar
@@ -8979,12 +8981,12 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
    if (IndexNotInserted.any())
        return SDValue();
  
-  return split16BStoreSplat(DAG, St, SplatVal, NumVecElts);
+  return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
  }
  
-static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
-                              SelectionDAG &DAG,
-                              const AArch64Subtarget *Subtarget) {
+static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+                           SelectionDAG &DAG,
+                           const AArch64Subtarget *Subtarget) {
    if (!DCI.isBeforeLegalize())
      return SDValue();
  
@@ -9174,7 +9176,7 @@ static SDValue performSTORECombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     SelectionDAG &DAG,
                                     const AArch64Subtarget *Subtarget) {
-  if (SDValue Split = split16BStores(N, DCI, DAG, Subtarget))
+  if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
      return Split;
  
    if (Subtarget->supportsAddressTopByteIgnored() &&
diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll

index 98d4e36..071b2d0 100644 (file)
--- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
@@ -6174,11 +6174,10 @@ define <2 x double> @test_v2f64_post_reg_ld1lane(double* %bar, double** %ptr, i6
  }
  
  ; Check for dependencies between the vector and the scalar load.
-define <4 x float> @test_v4f32_post_reg_ld1lane_dep_vec_on_load(float* %bar, float** %ptr, i64 %inc, <4 x float>* %dep_ptr_1, <4 x float>* %dep_ptr_2) {
+define <4 x float> @test_v4f32_post_reg_ld1lane_dep_vec_on_load(float* %bar, float** %ptr, i64 %inc, <4 x float>* %dep_ptr_1, <4 x float>* %dep_ptr_2, <4 x float> %vec) {
  ; CHECK-LABEL: test_v4f32_post_reg_ld1lane_dep_vec_on_load:
  ; CHECK: BB#0:
  ; CHECK-NEXT: ldr s[[LD:[0-9]+]], [x0]
-; CHECK-NEXT: movi.2d v0, #0000000000000000
  ; CHECK-NEXT: str q0, [x3]
  ; CHECK-NEXT: ldr q0, [x4]
  ; CHECK-NEXT: ins.s v0[1], v[[LD]][0]
@@ -6186,7 +6185,7 @@ define <4 x float> @test_v4f32_post_reg_ld1lane_dep_vec_on_load(float* %bar, flo
  ; CHECK-NEXT: str [[POST]], [x1]
  ; CHECK-NEXT: ret
    %tmp1 = load float, float* %bar
-  store <4 x float> zeroinitializer, <4 x float>* %dep_ptr_1, align 16
+  store <4 x float> %vec, <4 x float>* %dep_ptr_1, align 16
    %A = load <4 x float>, <4 x float>* %dep_ptr_2, align 16
    %tmp2 = insertelement <4 x float> %A, float %tmp1, i32 1
    %tmp3 = getelementptr float, float* %bar, i64 %inc
diff --git a/llvm/test/CodeGen/AArch64/ldst-opt.ll b/llvm/test/CodeGen/AArch64/ldst-opt.ll

index 5c54255..81e4b19 100644 (file)
--- a/llvm/test/CodeGen/AArch64/ldst-opt.ll
+++ b/llvm/test/CodeGen/AArch64/ldst-opt.ll
@@ -1433,6 +1433,62 @@ entry:
    ret void
  }
  
+; Like merge_zr32, but with 2-vector type.
+define void @merge_zr32_2vec(<2 x i32>* %p) {
+; CHECK-LABEL: merge_zr32_2vec:
+; CHECK: // %entry
+; CHECK-NEXT: str xzr, [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+entry:
+  store <2 x i32> zeroinitializer, <2 x i32>* %p
+  ret void
+}
+
+; Like merge_zr32, but with 3-vector type.
+define void @merge_zr32_3vec(<3 x i32>* %p) {
+; CHECK-LABEL: merge_zr32_3vec:
+; CHECK: // %entry
+; CHECK-NEXT: str xzr, [x{{[0-9]+}}]
+; CHECK-NEXT: str wzr, [x{{[0-9]+}}, #8]
+; CHECK-NEXT: ret
+entry:
+  store <3 x i32> zeroinitializer, <3 x i32>* %p
+  ret void
+}
+
+; Like merge_zr32, but with 4-vector type.
+define void @merge_zr32_4vec(<4 x i32>* %p) {
+; CHECK-LABEL: merge_zr32_4vec:
+; CHECK: // %entry
+; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+entry:
+  store <4 x i32> zeroinitializer, <4 x i32>* %p
+  ret void
+}
+
+; Like merge_zr32, but with 2-vector float type.
+define void @merge_zr32_2vecf(<2 x float>* %p) {
+; CHECK-LABEL: merge_zr32_2vecf:
+; CHECK: // %entry
+; CHECK-NEXT: str xzr, [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+entry:
+  store <2 x float> zeroinitializer, <2 x float>* %p
+  ret void
+}
+
+; Like merge_zr32, but with 4-vector float type.
+define void @merge_zr32_4vecf(<4 x float>* %p) {
+; CHECK-LABEL: merge_zr32_4vecf:
+; CHECK: // %entry
+; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+entry:
+  store <4 x float> zeroinitializer, <4 x float>* %p
+  ret void
+}
+
  ; Similar to merge_zr32, but for 64-bit values.
  define void @merge_zr64(i64* %p) {
  ; CHECK-LABEL: merge_zr64:
@@ -1464,3 +1520,38 @@ entry:
    store i64 0, i64* %p3
    ret void
  }
+
+; Like merge_zr64, but with 2-vector double type.
+define void @merge_zr64_2vecd(<2 x double>* %p) {
+; CHECK-LABEL: merge_zr64_2vecd:
+; CHECK: // %entry
+; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+entry:
+  store <2 x double> zeroinitializer, <2 x double>* %p
+  ret void
+}
+
+; Like merge_zr64, but with 3-vector i64 type.
+define void @merge_zr64_3vec(<3 x i64>* %p) {
+; CHECK-LABEL: merge_zr64_3vec:
+; CHECK: // %entry
+; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; CHECK-NEXT: str xzr, [x{{[0-9]+}}, #16]
+; CHECK-NEXT: ret
+entry:
+  store <3 x i64> zeroinitializer, <3 x i64>* %p
+  ret void
+}
+
+; Like merge_zr64_2, but with 4-vector double type.
+define void @merge_zr64_4vecd(<4 x double>* %p) {
+; CHECK-LABEL: merge_zr64_4vecd:
+; CHECK: // %entry
+; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; CHECK-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+entry:
+  store <4 x double> zeroinitializer, <4 x double>* %p
+  ret void
+}
author	Geoff Berry <gberry@codeaurora.org>
	Wed, 16 Nov 2016 19:35:19 +0000 (19:35 +0000)
committer	Geoff Berry <gberry@codeaurora.org>
	Wed, 16 Nov 2016 19:35:19 +0000 (19:35 +0000)
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp		patch | blob | history
llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll		patch | blob | history
llvm/test/CodeGen/AArch64/ldst-opt.ll		patch | blob | history