From f43b5100150f21ab4cd9c4591b6a219f9cc3ba4c Mon Sep 17 00:00:00 2001
From: Ulrich Weigand <ulrich.weigand@de.ibm.com>
Date: Thu, 20 Dec 2018 12:59:05 +0000
Subject: [PATCH] [SystemZ] Make better use of VLDEB

We already have special code (DAG combine support for FP_ROUND)
to recognize cases where we can use a vector version of VLEDB to
perform two floating-point truncates in parallel, but equivalent
support for VLDEB (vector floating-point extends) has been
missing so far.  This patch adds corresponding DAG combine
support for FP_EXTEND.

llvm-svn: 349746
---
 llvm/lib/Target/SystemZ/SystemZISelLowering.cpp | 51 ++++++++++++++++++++++++-
 llvm/lib/Target/SystemZ/SystemZISelLowering.h   |  1 +
 llvm/test/CodeGen/SystemZ/vec-conv-02.ll        | 17 +++++++--
 3 files changed, 65 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index d7951ca..cdec66b 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -527,6 +527,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::STORE);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   setTargetDAGCombine(ISD::FP_ROUND);
+  setTargetDAGCombine(ISD::FP_EXTEND);
   setTargetDAGCombine(ISD::BSWAP);
   setTargetDAGCombine(ISD::SDIV);
   setTargetDAGCombine(ISD::UDIV);
@@ -5485,7 +5486,7 @@ SDValue SystemZTargetLowering::combineFP_ROUND(
   // (fpround (extract_vector_elt X 0))
   // (fpround (extract_vector_elt X 1)) ->
   // (extract_vector_elt (VROUND X) 0)
-  // (extract_vector_elt (VROUND X) 1)
+  // (extract_vector_elt (VROUND X) 2)
   //
   // This is a special case since the target doesn't really support v2f32s.
   SelectionDAG &DAG = DCI.DAG;
@@ -5527,6 +5528,53 @@ SDValue SystemZTargetLowering::combineFP_ROUND(
   return SDValue();
 }
 
+SDValue SystemZTargetLowering::combineFP_EXTEND(
+    SDNode *N, DAGCombinerInfo &DCI) const {
+  // Combine a pair of scalar extends of elements 0 and 2 of one v4f32 X:
+  //   (fpextend (extract_vector_elt X 0)), (fpextend (extract_vector_elt X 2))
+  // into a single vector extend whose two f64 results are then extracted:
+  //   (extract_vector_elt (VEXTEND X) 0), (extract_vector_elt (VEXTEND X) 1)
+  //
+  // This is a special case since the target doesn't really support v2f32s.
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Op0 = N->getOperand(0);
+  if (N->getValueType(0) == MVT::f64 &&
+      Op0.hasOneUse() &&
+      Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+      Op0.getOperand(0).getValueType() == MVT::v4f32 &&
+      Op0.getOperand(1).getOpcode() == ISD::Constant &&
+      cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue() == 0) {
+    SDValue Vec = Op0.getOperand(0); // Source v4f32 vector X.
+    for (auto *U : Vec->uses()) { // Find the extract of element 2.
+      if (U != Op0.getNode() &&
+          U->hasOneUse() &&
+          U->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+          U->getOperand(0) == Vec &&
+          U->getOperand(1).getOpcode() == ISD::Constant &&
+          cast<ConstantSDNode>(U->getOperand(1))->getZExtValue() == 2) {
+        SDValue OtherExtend = SDValue(*U->use_begin(), 0); // U's only user.
+        if (OtherExtend.getOpcode() == ISD::FP_EXTEND &&
+            OtherExtend.getOperand(0) == SDValue(U, 0) &&
+            OtherExtend.getValueType() == MVT::f64) {
+          SDValue VExtend = DAG.getNode(SystemZISD::VEXTEND, SDLoc(N),
+                                        MVT::v2f64, Vec);
+          DCI.AddToWorklist(VExtend.getNode());
+          SDValue Extract1 =
+            DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(U), MVT::f64,
+                        VExtend, DAG.getConstant(1, SDLoc(U), MVT::i32));
+          DCI.AddToWorklist(Extract1.getNode());
+          DAG.ReplaceAllUsesOfValueWith(OtherExtend, Extract1);
+          SDValue Extract0 =
+            DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f64,
+                        VExtend, DAG.getConstant(0, SDLoc(Op0), MVT::i32));
+          return Extract0; // Combined value for N.
+        }
+      }
+    }
+  }
+  return SDValue(); // Pattern not matched.
+}
+
 SDValue SystemZTargetLowering::combineBSWAP(
     SDNode *N, DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -5745,6 +5793,7 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::EXTRACT_VECTOR_ELT: return combineEXTRACT_VECTOR_ELT(N, DCI);
   case SystemZISD::JOIN_DWORDS: return combineJOIN_DWORDS(N, DCI);
   case ISD::FP_ROUND:           return combineFP_ROUND(N, DCI);
+  case ISD::FP_EXTEND:          return combineFP_EXTEND(N, DCI);
   case ISD::BSWAP:              return combineBSWAP(N, DCI);
   case SystemZISD::BR_CCMASK:   return combineBR_CCMASK(N, DCI);
   case SystemZISD::SELECT_CCMASK: return combineSELECT_CCMASK(N, DCI);
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 172dbee..622da32 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -592,6 +592,7 @@ private:
   SDValue combineEXTRACT_VECTOR_ELT(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineJOIN_DWORDS(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineFP_ROUND(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue combineFP_EXTEND(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineBSWAP(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineBR_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineSELECT_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
diff --git a/llvm/test/CodeGen/SystemZ/vec-conv-02.ll b/llvm/test/CodeGen/SystemZ/vec-conv-02.ll
index ab84389..d4c0f72 100644
--- a/llvm/test/CodeGen/SystemZ/vec-conv-02.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-conv-02.ll
@@ -15,19 +15,30 @@ define void @f1(<2 x double> %val, <2 x float> *%ptr) {
 ; Test conversion of an f64 in a vector register to an f32.
 define float @f2(<2 x double> %vec) {
 ; CHECK-LABEL: f2:
-; CHECK: wledb %f0, %v24
+; CHECK: wledb %f0, %v24, 0, 0
 ; CHECK: br %r14
   %scalar = extractelement <2 x double> %vec, i32 0
   %ret = fptrunc double %scalar to float
   ret float %ret
 }
 
-; Test conversion of an f32 in a vector register to an f64.
-define double @f3(<4 x float> %vec) {
+; Test cases where even elements of a v4f32 are converted to f64s.
+define <2 x double> @f3(<4 x float> %vec) {
 ; CHECK-LABEL: f3:
+; CHECK: vldeb %v24, {{%v[0-9]+}}
+; CHECK: br %r14
+  %shuffle = shufflevector <4 x float> %vec, <4 x float> undef, <2 x i32> <i32 0, i32 2>
+  %res = fpext <2 x float> %shuffle to <2 x double>
+  ret <2 x double> %res
+}
+
+; Test conversion of an f32 in a vector register to an f64.
+define double @f4(<4 x float> %vec) {
+; CHECK-LABEL: f4:
 ; CHECK: wldeb %f0, %v24
 ; CHECK: br %r14
   %scalar = extractelement <4 x float> %vec, i32 0
   %ret = fpext float %scalar to double
   ret double %ret
 }
+
-- 
2.7.4