[PowerPC] Load two floats directly instead of using one 64-bit integer load

author Hal Finkel <hfinkel@anl.gov>

Thu, 31 Mar 2016 02:56:05 +0000 (02:56 +0000)

committer Hal Finkel <hfinkel@anl.gov>

Thu, 31 Mar 2016 02:56:05 +0000 (02:56 +0000)
author Hal Finkel <hfinkel@anl.gov>
Thu, 31 Mar 2016 02:56:05 +0000 (02:56 +0000)
committer Hal Finkel <hfinkel@anl.gov>
Thu, 31 Mar 2016 02:56:05 +0000 (02:56 +0000)
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp

index f3251ba..d0f4343 100644 (file)
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -10268,6 +10268,111 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
          return expandVSXLoadForLE(N, DCI);
      }
  
+    // We sometimes end up with a 64-bit integer load, from which we extract
+    // two single-precision floating-point numbers. This happens with
+    // std::complex<float>, and other similar structures, because of the way we
+    // canonicalize structure copies. However, if we lack direct moves,
+    // then the final bitcasts from the extracted integer values to the
+    // floating-point numbers turn into store/load pairs. Even with direct moves,
+    // just loading the two floating-point numbers is likely better.
+    auto ReplaceTwoFloatLoad = [&]() {
+      if (VT != MVT::i64)
+        return false;
+
+      if (LD->getExtensionType() != ISD::NON_EXTLOAD ||
+          LD->isVolatile())
+        return false;
+
+      //  We're looking for a sequence like this:
+      //  t13: i64,ch = load<LD8[%ref.tmp]> t0, t6, undef:i64
+      //      t16: i64 = srl t13, Constant:i32<32>
+      //    t17: i32 = truncate t16
+      //  t18: f32 = bitcast t17
+      //    t19: i32 = truncate t13
+      //  t20: f32 = bitcast t19
+
+      if (!LD->hasNUsesOfValue(2, 0))
+        return false;
+
+      auto UI = LD->use_begin();
+      while (UI.getUse().getResNo() != 0) ++UI;
+      SDNode *Trunc = *UI++;
+      while (UI.getUse().getResNo() != 0) ++UI;
+      SDNode *RightShift = *UI;
+      if (Trunc->getOpcode() != ISD::TRUNCATE)
+        std::swap(Trunc, RightShift);
+
+      if (Trunc->getOpcode() != ISD::TRUNCATE ||
+          Trunc->getValueType(0) != MVT::i32 ||
+          !Trunc->hasOneUse())
+        return false;
+      if (RightShift->getOpcode() != ISD::SRL ||
+          !isa<ConstantSDNode>(RightShift->getOperand(1)) ||
+          RightShift->getConstantOperandVal(1) != 32 ||
+          !RightShift->hasOneUse())
+        return false;
+
+      SDNode *Trunc2 = *RightShift->use_begin();
+      if (Trunc2->getOpcode() != ISD::TRUNCATE ||
+          Trunc2->getValueType(0) != MVT::i32 ||
+          !Trunc2->hasOneUse())
+        return false;
+
+      SDNode *Bitcast = *Trunc->use_begin();
+      SDNode *Bitcast2 = *Trunc2->use_begin();
+
+      if (Bitcast->getOpcode() != ISD::BITCAST ||
+          Bitcast->getValueType(0) != MVT::f32)
+        return false;
+      if (Bitcast2->getOpcode() != ISD::BITCAST ||          
+          Bitcast2->getValueType(0) != MVT::f32)
+        return false;
+
+      if (Subtarget.isLittleEndian())
+        std::swap(Bitcast, Bitcast2);
+
+      // Bitcast has the second float (in memory-layout order) and Bitcast2
+      // has the first one.
+
+      SDValue BasePtr = LD->getBasePtr();
+      if (LD->isIndexed()) {
+        assert(LD->getAddressingMode() == ISD::PRE_INC &&
+               "Non-pre-inc AM on PPC?");
+        BasePtr =
+          DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+                      LD->getOffset());
+      }
+
+      SDValue FloatLoad =
+        DAG.getLoad(MVT::f32, dl, LD->getChain(), BasePtr,
+                    LD->getPointerInfo(), false, LD->isNonTemporal(),
+                    LD->isInvariant(), LD->getAlignment(), LD->getAAInfo());
+      SDValue AddPtr =
+        DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
+                    BasePtr, DAG.getIntPtrConstant(4, dl));
+      SDValue FloatLoad2 =
+        DAG.getLoad(MVT::f32, dl, SDValue(FloatLoad.getNode(), 1), AddPtr,
+                    LD->getPointerInfo().getWithOffset(4), false,
+                    LD->isNonTemporal(), LD->isInvariant(),
+                    MinAlign(LD->getAlignment(), 4), LD->getAAInfo());
+
+      if (LD->isIndexed()) {
+       // Note that DAGCombine should re-form any pre-increment load(s) from
+       // what is produced here if that makes sense.
+        DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), BasePtr);
+      }
+
+      DCI.CombineTo(Bitcast2, FloatLoad);
+      DCI.CombineTo(Bitcast, FloatLoad2);
+
+      DAG.ReplaceAllUsesOfValueWith(SDValue(LD, LD->isIndexed() ? 2 : 1),
+                                    SDValue(FloatLoad2.getNode(), 1));
+      return true;
+    };
+
+    if (ReplaceTwoFloatLoad())
+      return SDValue(N, 0);
+
      EVT MemVT = LD->getMemoryVT();
      Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
      unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);
diff --git a/llvm/test/CodeGen/PowerPC/load-two-flts.ll b/llvm/test/CodeGen/PowerPC/load-two-flts.ll

new file mode 100644 (file)

index 0000000..270a852
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/load-two-flts.ll
@@ -0,0 +1,60 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-bgq-linux"
+
+define void @_Z4testSt7complexIfE(float %v0, float %v1, i64* %ref.tmp, float* %_M_value.realp.i.i, float* %_M_value.imagp.i.i) {
+entry:
+  %v2 = load i64, i64* %ref.tmp, align 8
+  %v3 = lshr i64 %v2, 32
+  %v4 = trunc i64 %v3 to i32
+  %v5 = bitcast i32 %v4 to float
+  %v6 = trunc i64 %v2 to i32
+  %v7 = bitcast i32 %v6 to float
+  %mul_ad.i.i = fmul fast float %v5, %v1
+  %mul_bc.i.i = fmul fast float %v7, %v0
+  %mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
+  %mul_ac.i.i = fmul fast float %v5, %v0
+  %mul_bd.i.i = fmul fast float %v7, %v1
+  %mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
+  store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
+  store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
+  ret void
+
+; CHECK-LABEL: @_Z4testSt7complexIfE
+; CHECK-NOT: ld {{[0-9]+}}, 0(5)
+; CHECK-NOT: stw
+; CHECK-NOT: rldicl
+; CHECK-DAG: lfs {{[0-9]+}}, 4(5)
+; CHECK-DAG: lfs {{[0-9]+}}, 0(5)
+; CHECK: blr
+}
+
+define i64* @_Z4testSt7complexIfE_idx(float %v0, float %v1, i64* %ref.tmp, float* %_M_value.realp.i.i, float* %_M_value.imagp.i.i) {
+entry:
+  %r = getelementptr i64, i64* %ref.tmp, i64 1
+  %v2 = load i64, i64* %r, align 8
+  %v3 = lshr i64 %v2, 32
+  %v4 = trunc i64 %v3 to i32
+  %v5 = bitcast i32 %v4 to float
+  %v6 = trunc i64 %v2 to i32
+  %v7 = bitcast i32 %v6 to float
+  %mul_ad.i.i = fmul fast float %v5, %v1
+  %mul_bc.i.i = fmul fast float %v7, %v0
+  %mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
+  %mul_ac.i.i = fmul fast float %v5, %v0
+  %mul_bd.i.i = fmul fast float %v7, %v1
+  %mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
+  store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
+  store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
+  ret i64* %r
+
+; CHECK-LABEL: @_Z4testSt7complexIfE
+; CHECK-NOT: ld {{[0-9]+}}, 8(5)
+; CHECK-NOT: ldu {{[0-9]+}}, 8(5)
+; CHECK-NOT: stw
+; CHECK-NOT: rldicl
+; CHECK-DAG: lfsu {{[0-9]+}}, 8(5)
+; CHECK-DAG: lfs {{[0-9]+}}, 4(5)
+; CHECK: blr
+}
+
author	Hal Finkel <hfinkel@anl.gov>
	Thu, 31 Mar 2016 02:56:05 +0000 (02:56 +0000)
committer	Hal Finkel <hfinkel@anl.gov>
	Thu, 31 Mar 2016 02:56:05 +0000 (02:56 +0000)
llvm/lib/Target/PowerPC/PPCISelLowering.cpp		patch \| blob \| history
llvm/test/CodeGen/PowerPC/load-two-flts.ll	[new file with mode: 0644]	patch \| blob