From 4c86dd903265be9fd72a5ebf7c568a15f3cad0a6 Mon Sep 17 00:00:00 2001
From: Ulrich Weigand <ulrich.weigand@de.ibm.com>
Date: Wed, 26 Jun 2019 17:19:12 +0000
Subject: [PATCH] Allow matching extend-from-memory with strict FP nodes

This implements a small enhancement to https://reviews.llvm.org/D55506

Specifically, while we were able to match strict FP nodes for
floating-point extend operations with a register as source, this
did not work for operations with memory as source.

That is because for regular operations, this is represented as
a combined "extload" node (which is a variant of a load SD node),
but there is no equivalent node for a strict FP operation.

However, it turns out that even in the absence of an extload
node, we can still just match the operations explicitly, e.g.
   (strict_fpextend (f32 (load node:$ptr)))

This patch implements that method to match the LDEB/LXEB/LXDB
SystemZ instructions even when the extend uses a strict-FP node.

llvm-svn: 364450
---
 llvm/include/llvm/Target/TargetSelectionDAG.td     | 13 ++++
 llvm/lib/Target/SystemZ/SystemZInstrFP.td          | 10 +--
 llvm/test/CodeGen/SystemZ/fp-strict-conv-02.ll     | 65 +++++++++++++++++---
 llvm/test/CodeGen/SystemZ/fp-strict-conv-03.ll     | 71 ++++++++++++++++++++--
 llvm/test/CodeGen/SystemZ/fp-strict-conv-04.ll     | 71 ++++++++++++++++++++--
 .../SystemZ/vector-constrained-fp-intrinsics.ll    | 52 ++++++----------
 6 files changed, 227 insertions(+), 55 deletions(-)
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index c499c01..3b5c767 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -1224,6 +1224,13 @@ def setle  : PatFrag<(ops node:$lhs, node:$rhs),
 def setne  : PatFrag<(ops node:$lhs, node:$rhs),
                      (setcc node:$lhs, node:$rhs, SETNE)>;
 
+// We don't have strict FP extended loads as single DAG nodes, but we can
+// still provide convenience fragments to match those operations.
+def strict_extloadf32 : PatFrag<(ops node:$ptr),
+                                (strict_fpextend (f32 (load node:$ptr)))>;
+def strict_extloadf64 : PatFrag<(ops node:$ptr),
+                                (strict_fpextend (f64 (load node:$ptr)))>;
+
 // Convenience fragments to match both strict and non-strict fp operations
 def any_fadd       : PatFrags<(ops node:$lhs, node:$rhs),
                               [(strict_fadd node:$lhs, node:$rhs),
@@ -1291,6 +1298,12 @@ def any_fpround    : PatFrags<(ops node:$src),
 def any_fpextend   : PatFrags<(ops node:$src),
                               [(strict_fpextend node:$src),
                                (fpextend node:$src)]>;
+def any_extloadf32 : PatFrags<(ops node:$ptr),
+                              [(strict_extloadf32 node:$ptr),
+                               (extloadf32 node:$ptr)]>;
+def any_extloadf64 : PatFrags<(ops node:$ptr),
+                              [(strict_extloadf64 node:$ptr),
+                               (extloadf64 node:$ptr)]>;
 
 multiclass binary_atomic_op_ord<SDNode atomic_op> {
   def #NAME#_monotonic : PatFrag<(ops node:$ptr, node:$val),
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
index 55b1789..19c7ec5 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
@@ -208,14 +208,14 @@ let Predicates = [FeatureNoVectorEnhancements1] in {
 
 // Extend memory floating-point values to wider representations.
 let Uses = [FPC], mayRaiseFPException = 1 in {
-  def LDEB : UnaryRXE<"ldeb", 0xED04, extloadf32, FP64,  4>;
-  def LXEB : UnaryRXE<"lxeb", 0xED06, null_frag,  FP128, 4>;
-  def LXDB : UnaryRXE<"lxdb", 0xED05, null_frag,  FP128, 8>;
+  def LDEB : UnaryRXE<"ldeb", 0xED04, any_extloadf32, FP64, 4>;
+  def LXEB : UnaryRXE<"lxeb", 0xED06, null_frag, FP128, 4>;
+  def LXDB : UnaryRXE<"lxdb", 0xED05, null_frag, FP128, 8>;
 }
 let Predicates = [FeatureNoVectorEnhancements1] in {
-  def : Pat<(f128 (extloadf32 bdxaddr12only:$src)),
+  def : Pat<(f128 (any_extloadf32 bdxaddr12only:$src)),
             (LXEB bdxaddr12only:$src)>;
-  def : Pat<(f128 (extloadf64 bdxaddr12only:$src)),
+  def : Pat<(f128 (any_extloadf64 bdxaddr12only:$src)),
             (LXDB bdxaddr12only:$src)>;
 }
 
diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-02.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-02.ll
index 2ac3755..0f24b91 100644
--- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-02.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-02.ll
@@ -1,9 +1,6 @@
 ; Test strict extensions of f32 to f64.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \
-; RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \
-; RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
 
 declare double @llvm.experimental.constrained.fpext.f64.f32(float, metadata)
 
@@ -17,17 +14,67 @@ define double @f1(float %val) {
   ret double %res
 }
 
-; Check extension from memory.
-; FIXME: This should really use LDEB, but there is no strict "extload" yet.
+; Check the low end of the LDEB range.
 define double @f2(float *%ptr) {
 ; CHECK-LABEL: f2:
-; CHECK-SCALAR: le %f0, 0(%r2)
-; CHECK-VECTOR: lde %f0, 0(%r2)
-; CHECK: ldebr %f0, %f0
+; CHECK: ldeb %f0, 0(%r2)
+; CHECK: br %r14
+  %val = load float, float *%ptr
+  %res = call double @llvm.experimental.constrained.fpext.f64.f32(float %val,
+                                               metadata !"fpexcept.strict")
+  ret double %res
+}
+
+; Check the high end of the aligned LDEB range.
+define double @f3(float *%base) {
+; CHECK-LABEL: f3:
+; CHECK: ldeb %f0, 4092(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr float, float *%base, i64 1023
+  %val = load float, float *%ptr
+  %res = call double @llvm.experimental.constrained.fpext.f64.f32(float %val,
+                                               metadata !"fpexcept.strict")
+  ret double %res
+}
+
+; Check the next word up, which needs separate address logic.
+; Other sequences besides this one would be OK.
+define double @f4(float *%base) {
+; CHECK-LABEL: f4:
+; CHECK: aghi %r2, 4096
+; CHECK: ldeb %f0, 0(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr float, float *%base, i64 1024
+  %val = load float, float *%ptr
+  %res = call double @llvm.experimental.constrained.fpext.f64.f32(float %val,
+                                               metadata !"fpexcept.strict")
+  ret double %res
+}
+
+; Check negative displacements, which also need separate address logic.
+define double @f5(float *%base) {
+; CHECK-LABEL: f5:
+; CHECK: aghi %r2, -4
+; CHECK: ldeb %f0, 0(%r2)
 ; CHECK: br %r14
+  %ptr = getelementptr float, float *%base, i64 -1
   %val = load float, float *%ptr
   %res = call double @llvm.experimental.constrained.fpext.f64.f32(float %val,
                                                metadata !"fpexcept.strict")
   ret double %res
 }
 
+; Check that LDEB allows indices.
+define double @f6(float *%base, i64 %index) {
+; CHECK-LABEL: f6:
+; CHECK: sllg %r1, %r3, 2
+; CHECK: ldeb %f0, 400(%r1,%r2)
+; CHECK: br %r14
+  %ptr1 = getelementptr float, float *%base, i64 %index
+  %ptr2 = getelementptr float, float *%ptr1, i64 100
+  %val = load float, float *%ptr2
+  %res = call double @llvm.experimental.constrained.fpext.f64.f32(float %val,
+                                               metadata !"fpexcept.strict")
+  ret double %res
+}
+
diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-03.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-03.ll
index 1fd04b1..b3fbac9 100644
--- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-03.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-03.ll
@@ -17,15 +17,61 @@ define void @f1(fp128 *%dst, float %val) {
   ret void
 }
 
-; Check extension from memory.
-; FIXME: This should really use LXEB, but there is no strict "extload" yet.
+; Check the low end of the LXEB range.
 define void @f2(fp128 *%dst, float *%ptr) {
 ; CHECK-LABEL: f2:
-; CHECK: le %f0, 0(%r3)
-; CHECK: lxebr %f0, %f0
+; CHECK: lxeb %f0, 0(%r3)
+; CHECK: std %f0, 0(%r2)
+; CHECK: std %f2, 8(%r2)
+; CHECK: br %r14
+  %val = load float, float *%ptr
+  %res = call fp128 @llvm.experimental.constrained.fpext.f128.f32(float %val,
+                                               metadata !"fpexcept.strict")
+  store fp128 %res, fp128 *%dst
+  ret void
+}
+
+; Check the high end of the aligned LXEB range.
+define void @f3(fp128 *%dst, float *%base) {
+; CHECK-LABEL: f3:
+; CHECK: lxeb %f0, 4092(%r3)
+; CHECK: std %f0, 0(%r2)
+; CHECK: std %f2, 8(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr float, float *%base, i64 1023
+  %val = load float, float *%ptr
+  %res = call fp128 @llvm.experimental.constrained.fpext.f128.f32(float %val,
+                                               metadata !"fpexcept.strict")
+  store fp128 %res, fp128 *%dst
+  ret void
+}
+
+; Check the next word up, which needs separate address logic.
+; Other sequences besides this one would be OK.
+define void @f4(fp128 *%dst, float *%base) {
+; CHECK-LABEL: f4:
+; CHECK: aghi %r3, 4096
+; CHECK: lxeb %f0, 0(%r3)
+; CHECK: std %f0, 0(%r2)
+; CHECK: std %f2, 8(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr float, float *%base, i64 1024
+  %val = load float, float *%ptr
+  %res = call fp128 @llvm.experimental.constrained.fpext.f128.f32(float %val,
+                                               metadata !"fpexcept.strict")
+  store fp128 %res, fp128 *%dst
+  ret void
+}
+
+; Check negative displacements, which also need separate address logic.
+define void @f5(fp128 *%dst, float *%base) {
+; CHECK-LABEL: f5:
+; CHECK: aghi %r3, -4
+; CHECK: lxeb %f0, 0(%r3)
 ; CHECK: std %f0, 0(%r2)
 ; CHECK: std %f2, 8(%r2)
 ; CHECK: br %r14
+  %ptr = getelementptr float, float *%base, i64 -1
   %val = load float, float *%ptr
   %res = call fp128 @llvm.experimental.constrained.fpext.f128.f32(float %val,
                                                metadata !"fpexcept.strict")
@@ -33,3 +79,20 @@ define void @f2(fp128 *%dst, float *%ptr) {
   ret void
 }
 
+; Check that LXEB allows indices.
+define void @f6(fp128 *%dst, float *%base, i64 %index) {
+; CHECK-LABEL: f6:
+; CHECK: sllg %r1, %r4, 2
+; CHECK: lxeb %f0, 400(%r1,%r3)
+; CHECK: std %f0, 0(%r2)
+; CHECK: std %f2, 8(%r2)
+; CHECK: br %r14
+  %ptr1 = getelementptr float, float *%base, i64 %index
+  %ptr2 = getelementptr float, float *%ptr1, i64 100
+  %val = load float, float *%ptr2
+  %res = call fp128 @llvm.experimental.constrained.fpext.f128.f32(float %val,
+                                               metadata !"fpexcept.strict")
+  store fp128 %res, fp128 *%dst
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-04.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-04.ll
index 5c67fff..657cdcd 100644
--- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-04.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-04.ll
@@ -17,15 +17,61 @@ define void @f1(fp128 *%dst, double %val) {
   ret void
 }
 
-; Check extension from memory.
-; FIXME: This should really use LXDB, but there is no strict "extload" yet.
+; Check the low end of the LXDB range.
 define void @f2(fp128 *%dst, double *%ptr) {
 ; CHECK-LABEL: f2:
-; CHECK: ld %f0, 0(%r3)
-; CHECK: lxdbr %f0, %f0
+; CHECK: lxdb %f0, 0(%r3)
+; CHECK: std %f0, 0(%r2)
+; CHECK: std %f2, 8(%r2)
+; CHECK: br %r14
+  %val = load double, double *%ptr
+  %res = call fp128 @llvm.experimental.constrained.fpext.f128.f64(double %val,
+                                               metadata !"fpexcept.strict")
+  store fp128 %res, fp128 *%dst
+  ret void
+}
+
+; Check the high end of the aligned LXDB range.
+define void @f3(fp128 *%dst, double *%base) {
+; CHECK-LABEL: f3:
+; CHECK: lxdb %f0, 4088(%r3)
+; CHECK: std %f0, 0(%r2)
+; CHECK: std %f2, 8(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr double, double *%base, i64 511
+  %val = load double, double *%ptr
+  %res = call fp128 @llvm.experimental.constrained.fpext.f128.f64(double %val,
+                                               metadata !"fpexcept.strict")
+  store fp128 %res, fp128 *%dst
+  ret void
+}
+
+; Check the next doubleword up, which needs separate address logic.
+; Other sequences besides this one would be OK.
+define void @f4(fp128 *%dst, double *%base) {
+; CHECK-LABEL: f4:
+; CHECK: aghi %r3, 4096
+; CHECK: lxdb %f0, 0(%r3)
+; CHECK: std %f0, 0(%r2)
+; CHECK: std %f2, 8(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr double, double *%base, i64 512
+  %val = load double, double *%ptr
+  %res = call fp128 @llvm.experimental.constrained.fpext.f128.f64(double %val,
+                                               metadata !"fpexcept.strict")
+  store fp128 %res, fp128 *%dst
+  ret void
+}
+
+; Check negative displacements, which also need separate address logic.
+define void @f5(fp128 *%dst, double *%base) {
+; CHECK-LABEL: f5:
+; CHECK: aghi %r3, -8
+; CHECK: lxdb %f0, 0(%r3)
 ; CHECK: std %f0, 0(%r2)
 ; CHECK: std %f2, 8(%r2)
 ; CHECK: br %r14
+  %ptr = getelementptr double, double *%base, i64 -1
   %val = load double, double *%ptr
   %res = call fp128 @llvm.experimental.constrained.fpext.f128.f64(double %val,
                                                metadata !"fpexcept.strict")
@@ -33,3 +79,20 @@ define void @f2(fp128 *%dst, double *%ptr) {
   ret void
 }
 
+; Check that LXDB allows indices.
+define void @f6(fp128 *%dst, double *%base, i64 %index) {
+; CHECK-LABEL: f6:
+; CHECK: sllg %r1, %r4, 3
+; CHECK: lxdb %f0, 800(%r1,%r3)
+; CHECK: std %f0, 0(%r2)
+; CHECK: std %f2, 8(%r2)
+; CHECK: br %r14
+  %ptr1 = getelementptr double, double *%base, i64 %index
+  %ptr2 = getelementptr double, double *%ptr1, i64 100
+  %val = load double, double *%ptr2
+  %res = call fp128 @llvm.experimental.constrained.fpext.f128.f64(double %val,
+                                               metadata !"fpexcept.strict")
+  store fp128 %res, fp128 *%dst
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll
index aab60ce..ec1e674 100644
--- a/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll
@@ -5504,15 +5504,14 @@ define <1 x double> @constrained_vector_fpext_v1f32() {
 ; S390X-LABEL: constrained_vector_fpext_v1f32:
 ; S390X:       # %bb.0: # %entry
 ; S390X-NEXT:    larl %r1, .LCPI99_0
-; S390X-NEXT:    le %f0, 0(%r1)
-; S390X-NEXT:    ldebr %f0, %f0
+; S390X-NEXT:    ldeb %f0, 0(%r1)
 ; S390X-NEXT:    br %r14
 ;
 ; SZ13-LABEL: constrained_vector_fpext_v1f32:
 ; SZ13:       # %bb.0: # %entry
 ; SZ13-NEXT:    larl %r1, .LCPI99_0
-; SZ13-NEXT:    lde %f0, 0(%r1)
-; SZ13-NEXT:    wldeb %v24, %f0
+; SZ13-NEXT:    ldeb %f0, 0(%r1)
+; SZ13-NEXT:    vlr %v24, %v0
 ; SZ13-NEXT:    br %r14
 entry:
   %result = call <1 x double> @llvm.experimental.constrained.fpext.v1f64.v1f32(
@@ -5525,21 +5524,17 @@ define <2 x double> @constrained_vector_fpext_v2f32() {
 ; S390X-LABEL: constrained_vector_fpext_v2f32:
 ; S390X:       # %bb.0: # %entry
 ; S390X-NEXT:    larl %r1, .LCPI100_0
-; S390X-NEXT:    le %f0, 0(%r1)
+; S390X-NEXT:    ldeb %f2, 0(%r1)
 ; S390X-NEXT:    larl %r1, .LCPI100_1
-; S390X-NEXT:    le %f1, 0(%r1)
-; S390X-NEXT:    ldebr %f2, %f0
-; S390X-NEXT:    ldebr %f0, %f1
+; S390X-NEXT:    ldeb %f0, 0(%r1)
 ; S390X-NEXT:    br %r14
 ;
 ; SZ13-LABEL: constrained_vector_fpext_v2f32:
 ; SZ13:       # %bb.0: # %entry
 ; SZ13-NEXT:    larl %r1, .LCPI100_0
-; SZ13-NEXT:    lde %f0, 0(%r1)
+; SZ13-NEXT:    ldeb %f0, 0(%r1)
 ; SZ13-NEXT:    larl %r1, .LCPI100_1
-; SZ13-NEXT:    lde %f1, 0(%r1)
-; SZ13-NEXT:    ldebr %f0, %f0
-; SZ13-NEXT:    ldebr %f1, %f1
+; SZ13-NEXT:    ldeb %f1, 0(%r1)
 ; SZ13-NEXT:    vmrhg %v24, %v1, %v0
 ; SZ13-NEXT:    br %r14
 entry:
@@ -5553,16 +5548,15 @@ define void @constrained_vector_fpext_v3f64(<3 x float>* %src, <3 x double>* %de
 ; S390X-LABEL: constrained_vector_fpext_v3f64:
 ; S390X:       # %bb.0: # %entry
 ; S390X-NEXT:    lg %r0, 0(%r2)
-; S390X-NEXT:    le %f0, 8(%r2)
 ; S390X-NEXT:    sllg %r1, %r0, 32
-; S390X-NEXT:    ldgr %f1, %r1
+; S390X-NEXT:    ldgr %f0, %r1
 ; S390X-NEXT:    nilf %r0, 0
+; S390X-NEXT:    ldeb %f1, 8(%r2)
 ; S390X-NEXT:    ldgr %f2, %r0
 ; S390X-NEXT:    ldebr %f2, %f2
-; S390X-NEXT:    ldebr %f1, %f1
 ; S390X-NEXT:    ldebr %f0, %f0
-; S390X-NEXT:    std %f0, 16(%r3)
-; S390X-NEXT:    std %f1, 8(%r3)
+; S390X-NEXT:    std %f1, 16(%r3)
+; S390X-NEXT:    std %f0, 8(%r3)
 ; S390X-NEXT:    std %f2, 0(%r3)
 ; S390X-NEXT:    br %r14
 ;
@@ -5591,34 +5585,26 @@ define <4 x double> @constrained_vector_fpext_v4f32() {
 ; S390X-LABEL: constrained_vector_fpext_v4f32:
 ; S390X:       # %bb.0: # %entry
 ; S390X-NEXT:    larl %r1, .LCPI102_0
-; S390X-NEXT:    le %f0, 0(%r1)
+; S390X-NEXT:    ldeb %f6, 0(%r1)
 ; S390X-NEXT:    larl %r1, .LCPI102_1
-; S390X-NEXT:    le %f1, 0(%r1)
+; S390X-NEXT:    ldeb %f4, 0(%r1)
 ; S390X-NEXT:    larl %r1, .LCPI102_2
-; S390X-NEXT:    le %f2, 0(%r1)
+; S390X-NEXT:    ldeb %f2, 0(%r1)
 ; S390X-NEXT:    larl %r1, .LCPI102_3
-; S390X-NEXT:    le %f3, 0(%r1)
-; S390X-NEXT:    ldebr %f6, %f0
-; S390X-NEXT:    ldebr %f4, %f1
-; S390X-NEXT:    ldebr %f2, %f2
-; S390X-NEXT:    ldebr %f0, %f3
+; S390X-NEXT:    ldeb %f0, 0(%r1)
 ; S390X-NEXT:    br %r14
 ;
 ; SZ13-LABEL: constrained_vector_fpext_v4f32:
 ; SZ13:       # %bb.0: # %entry
 ; SZ13-NEXT:    larl %r1, .LCPI102_0
-; SZ13-NEXT:    lde %f0, 0(%r1)
+; SZ13-NEXT:    ldeb %f0, 0(%r1)
 ; SZ13-NEXT:    larl %r1, .LCPI102_1
-; SZ13-NEXT:    lde %f1, 0(%r1)
-; SZ13-NEXT:    ldebr %f0, %f0
-; SZ13-NEXT:    ldebr %f1, %f1
+; SZ13-NEXT:    ldeb %f1, 0(%r1)
 ; SZ13-NEXT:    larl %r1, .LCPI102_2
 ; SZ13-NEXT:    vmrhg %v24, %v1, %v0
-; SZ13-NEXT:    lde %f0, 0(%r1)
+; SZ13-NEXT:    ldeb %f0, 0(%r1)
 ; SZ13-NEXT:    larl %r1, .LCPI102_3
-; SZ13-NEXT:    lde %f1, 0(%r1)
-; SZ13-NEXT:    ldebr %f0, %f0
-; SZ13-NEXT:    ldebr %f1, %f1
+; SZ13-NEXT:    ldeb %f1, 0(%r1)
 ; SZ13-NEXT:    vmrhg %v26, %v1, %v0
 ; SZ13-NEXT:    br %r14
 entry:
-- 
2.7.4