From 4c86dd903265be9fd72a5ebf7c568a15f3cad0a6 Mon Sep 17 00:00:00 2001 From: Ulrich Weigand Date: Wed, 26 Jun 2019 17:19:12 +0000 Subject: [PATCH] Allow matching extend-from-memory with strict FP nodes This implements a small enhancement to https://reviews.llvm.org/D55506 Specifically, while we were able to match strict FP nodes for floating-point extend operations with a register as source, this did not work for operations with memory as source. That is because for regular operations, this is represented as a combined "extload" node (which is a variant of a load SD node); but there is no equivalent using a strict FP operation. However, it turns out that even in the absence of an extload node, we can still just match the operations explicitly, e.g. (strict_fpextend (f32 (load node:$ptr))) This patch implements that method to match the LDEB/LXEB/LXDB SystemZ instructions even when the extend uses a strict-FP node. llvm-svn: 364450 --- llvm/include/llvm/Target/TargetSelectionDAG.td | 13 ++++ llvm/lib/Target/SystemZ/SystemZInstrFP.td | 10 +-- llvm/test/CodeGen/SystemZ/fp-strict-conv-02.ll | 65 +++++++++++++++++--- llvm/test/CodeGen/SystemZ/fp-strict-conv-03.ll | 71 ++++++++++++++++++++-- llvm/test/CodeGen/SystemZ/fp-strict-conv-04.ll | 71 ++++++++++++++++++++-- .../SystemZ/vector-constrained-fp-intrinsics.ll | 52 ++++++---------- 6 files changed, 227 insertions(+), 55 deletions(-) diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index c499c01..3b5c767 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -1224,6 +1224,13 @@ def setle : PatFrag<(ops node:$lhs, node:$rhs), def setne : PatFrag<(ops node:$lhs, node:$rhs), (setcc node:$lhs, node:$rhs, SETNE)>; +// We don't have strict FP extended loads as single DAG nodes, but we can +// still provide convenience fragments to match those operations. 
+def strict_extloadf32 : PatFrag<(ops node:$ptr), + (strict_fpextend (f32 (load node:$ptr)))>; +def strict_extloadf64 : PatFrag<(ops node:$ptr), + (strict_fpextend (f64 (load node:$ptr)))>; + // Convenience fragments to match both strict and non-strict fp operations def any_fadd : PatFrags<(ops node:$lhs, node:$rhs), [(strict_fadd node:$lhs, node:$rhs), @@ -1291,6 +1298,12 @@ def any_fpround : PatFrags<(ops node:$src), def any_fpextend : PatFrags<(ops node:$src), [(strict_fpextend node:$src), (fpextend node:$src)]>; +def any_extloadf32 : PatFrags<(ops node:$ptr), + [(strict_extloadf32 node:$ptr), + (extloadf32 node:$ptr)]>; +def any_extloadf64 : PatFrags<(ops node:$ptr), + [(strict_extloadf64 node:$ptr), + (extloadf64 node:$ptr)]>; multiclass binary_atomic_op_ord { def #NAME#_monotonic : PatFrag<(ops node:$ptr, node:$val), diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/llvm/lib/Target/SystemZ/SystemZInstrFP.td index 55b1789..19c7ec5 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrFP.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrFP.td @@ -208,14 +208,14 @@ let Predicates = [FeatureNoVectorEnhancements1] in { // Extend memory floating-point values to wider representations. 
let Uses = [FPC], mayRaiseFPException = 1 in { - def LDEB : UnaryRXE<"ldeb", 0xED04, extloadf32, FP64, 4>; - def LXEB : UnaryRXE<"lxeb", 0xED06, null_frag, FP128, 4>; - def LXDB : UnaryRXE<"lxdb", 0xED05, null_frag, FP128, 8>; + def LDEB : UnaryRXE<"ldeb", 0xED04, any_extloadf32, FP64, 4>; + def LXEB : UnaryRXE<"lxeb", 0xED06, null_frag, FP128, 4>; + def LXDB : UnaryRXE<"lxdb", 0xED05, null_frag, FP128, 8>; } let Predicates = [FeatureNoVectorEnhancements1] in { - def : Pat<(f128 (extloadf32 bdxaddr12only:$src)), + def : Pat<(f128 (any_extloadf32 bdxaddr12only:$src)), (LXEB bdxaddr12only:$src)>; - def : Pat<(f128 (extloadf64 bdxaddr12only:$src)), + def : Pat<(f128 (any_extloadf64 bdxaddr12only:$src)), (LXDB bdxaddr12only:$src)>; } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-02.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-02.ll index 2ac3755..0f24b91 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-02.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-02.ll @@ -1,9 +1,6 @@ ; Test strict extensions of f32 to f64. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ -; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \ -; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s declare double @llvm.experimental.constrained.fpext.f64.f32(float, metadata) @@ -17,17 +14,67 @@ define double @f1(float %val) { ret double %res } -; Check extension from memory. -; FIXME: This should really use LDEB, but there is no strict "extload" yet. +; Check the low end of the LDEB range. 
define double @f2(float *%ptr) { ; CHECK-LABEL: f2: -; CHECK-SCALAR: le %f0, 0(%r2) -; CHECK-VECTOR: lde %f0, 0(%r2) -; CHECK: ldebr %f0, %f0 +; CHECK: ldeb %f0, 0(%r2) +; CHECK: br %r14 + %val = load float, float *%ptr + %res = call double @llvm.experimental.constrained.fpext.f64.f32(float %val, + metadata !"fpexcept.strict") + ret double %res +} + +; Check the high end of the aligned LDEB range. +define double @f3(float *%base) { +; CHECK-LABEL: f3: +; CHECK: ldeb %f0, 4092(%r2) +; CHECK: br %r14 + %ptr = getelementptr float, float *%base, i64 1023 + %val = load float, float *%ptr + %res = call double @llvm.experimental.constrained.fpext.f64.f32(float %val, + metadata !"fpexcept.strict") + ret double %res +} + +; Check the next word up, which needs separate address logic. +; Other sequences besides this one would be OK. +define double @f4(float *%base) { +; CHECK-LABEL: f4: +; CHECK: aghi %r2, 4096 +; CHECK: ldeb %f0, 0(%r2) +; CHECK: br %r14 + %ptr = getelementptr float, float *%base, i64 1024 + %val = load float, float *%ptr + %res = call double @llvm.experimental.constrained.fpext.f64.f32(float %val, + metadata !"fpexcept.strict") + ret double %res +} + +; Check negative displacements, which also need separate address logic. +define double @f5(float *%base) { +; CHECK-LABEL: f5: +; CHECK: aghi %r2, -4 +; CHECK: ldeb %f0, 0(%r2) ; CHECK: br %r14 + %ptr = getelementptr float, float *%base, i64 -1 %val = load float, float *%ptr %res = call double @llvm.experimental.constrained.fpext.f64.f32(float %val, metadata !"fpexcept.strict") ret double %res } +; Check that LDEB allows indices. 
+define double @f6(float *%base, i64 %index) { +; CHECK-LABEL: f6: +; CHECK: sllg %r1, %r3, 2 +; CHECK: ldeb %f0, 400(%r1,%r2) +; CHECK: br %r14 + %ptr1 = getelementptr float, float *%base, i64 %index + %ptr2 = getelementptr float, float *%ptr1, i64 100 + %val = load float, float *%ptr2 + %res = call double @llvm.experimental.constrained.fpext.f64.f32(float %val, + metadata !"fpexcept.strict") + ret double %res +} + diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-03.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-03.ll index 1fd04b1..b3fbac9 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-03.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-03.ll @@ -17,15 +17,61 @@ define void @f1(fp128 *%dst, float %val) { ret void } -; Check extension from memory. -; FIXME: This should really use LXEB, but there is no strict "extload" yet. +; Check the low end of the LXEB range. define void @f2(fp128 *%dst, float *%ptr) { ; CHECK-LABEL: f2: -; CHECK: le %f0, 0(%r3) -; CHECK: lxebr %f0, %f0 +; CHECK: lxeb %f0, 0(%r3) +; CHECK: std %f0, 0(%r2) +; CHECK: std %f2, 8(%r2) +; CHECK: br %r14 + %val = load float, float *%ptr + %res = call fp128 @llvm.experimental.constrained.fpext.f128.f32(float %val, + metadata !"fpexcept.strict") + store fp128 %res, fp128 *%dst + ret void +} + +; Check the high end of the aligned LXEB range. +define void @f3(fp128 *%dst, float *%base) { +; CHECK-LABEL: f3: +; CHECK: lxeb %f0, 4092(%r3) +; CHECK: std %f0, 0(%r2) +; CHECK: std %f2, 8(%r2) +; CHECK: br %r14 + %ptr = getelementptr float, float *%base, i64 1023 + %val = load float, float *%ptr + %res = call fp128 @llvm.experimental.constrained.fpext.f128.f32(float %val, + metadata !"fpexcept.strict") + store fp128 %res, fp128 *%dst + ret void +} + +; Check the next word up, which needs separate address logic. +; Other sequences besides this one would be OK. 
+define void @f4(fp128 *%dst, float *%base) { +; CHECK-LABEL: f4: +; CHECK: aghi %r3, 4096 +; CHECK: lxeb %f0, 0(%r3) +; CHECK: std %f0, 0(%r2) +; CHECK: std %f2, 8(%r2) +; CHECK: br %r14 + %ptr = getelementptr float, float *%base, i64 1024 + %val = load float, float *%ptr + %res = call fp128 @llvm.experimental.constrained.fpext.f128.f32(float %val, + metadata !"fpexcept.strict") + store fp128 %res, fp128 *%dst + ret void +} + +; Check negative displacements, which also need separate address logic. +define void @f5(fp128 *%dst, float *%base) { +; CHECK-LABEL: f5: +; CHECK: aghi %r3, -4 +; CHECK: lxeb %f0, 0(%r3) ; CHECK: std %f0, 0(%r2) ; CHECK: std %f2, 8(%r2) ; CHECK: br %r14 + %ptr = getelementptr float, float *%base, i64 -1 %val = load float, float *%ptr %res = call fp128 @llvm.experimental.constrained.fpext.f128.f32(float %val, metadata !"fpexcept.strict") @@ -33,3 +79,20 @@ define void @f2(fp128 *%dst, float *%ptr) { ret void } +; Check that LXEB allows indices. +define void @f6(fp128 *%dst, float *%base, i64 %index) { +; CHECK-LABEL: f6: +; CHECK: sllg %r1, %r4, 2 +; CHECK: lxeb %f0, 400(%r1,%r3) +; CHECK: std %f0, 0(%r2) +; CHECK: std %f2, 8(%r2) +; CHECK: br %r14 + %ptr1 = getelementptr float, float *%base, i64 %index + %ptr2 = getelementptr float, float *%ptr1, i64 100 + %val = load float, float *%ptr2 + %res = call fp128 @llvm.experimental.constrained.fpext.f128.f32(float %val, + metadata !"fpexcept.strict") + store fp128 %res, fp128 *%dst + ret void +} + diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-04.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-04.ll index 5c67fff..657cdcd 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-04.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-04.ll @@ -17,15 +17,61 @@ define void @f1(fp128 *%dst, double %val) { ret void } -; Check extension from memory. -; FIXME: This should really use LXDB, but there is no strict "extload" yet. +; Check the low end of the LXDB range. 
define void @f2(fp128 *%dst, double *%ptr) { ; CHECK-LABEL: f2: -; CHECK: ld %f0, 0(%r3) -; CHECK: lxdbr %f0, %f0 +; CHECK: lxdb %f0, 0(%r3) +; CHECK: std %f0, 0(%r2) +; CHECK: std %f2, 8(%r2) +; CHECK: br %r14 + %val = load double, double *%ptr + %res = call fp128 @llvm.experimental.constrained.fpext.f128.f64(double %val, + metadata !"fpexcept.strict") + store fp128 %res, fp128 *%dst + ret void +} + +; Check the high end of the aligned LXDB range. +define void @f3(fp128 *%dst, double *%base) { +; CHECK-LABEL: f3: +; CHECK: lxdb %f0, 4088(%r3) +; CHECK: std %f0, 0(%r2) +; CHECK: std %f2, 8(%r2) +; CHECK: br %r14 + %ptr = getelementptr double, double *%base, i64 511 + %val = load double, double *%ptr + %res = call fp128 @llvm.experimental.constrained.fpext.f128.f64(double %val, + metadata !"fpexcept.strict") + store fp128 %res, fp128 *%dst + ret void +} + +; Check the next doubleword up, which needs separate address logic. +; Other sequences besides this one would be OK. +define void @f4(fp128 *%dst, double *%base) { +; CHECK-LABEL: f4: +; CHECK: aghi %r3, 4096 +; CHECK: lxdb %f0, 0(%r3) +; CHECK: std %f0, 0(%r2) +; CHECK: std %f2, 8(%r2) +; CHECK: br %r14 + %ptr = getelementptr double, double *%base, i64 512 + %val = load double, double *%ptr + %res = call fp128 @llvm.experimental.constrained.fpext.f128.f64(double %val, + metadata !"fpexcept.strict") + store fp128 %res, fp128 *%dst + ret void +} + +; Check negative displacements, which also need separate address logic. +define void @f5(fp128 *%dst, double *%base) { +; CHECK-LABEL: f5: +; CHECK: aghi %r3, -8 +; CHECK: lxdb %f0, 0(%r3) ; CHECK: std %f0, 0(%r2) ; CHECK: std %f2, 8(%r2) ; CHECK: br %r14 + %ptr = getelementptr double, double *%base, i64 -1 %val = load double, double *%ptr %res = call fp128 @llvm.experimental.constrained.fpext.f128.f64(double %val, metadata !"fpexcept.strict") @@ -33,3 +79,20 @@ define void @f2(fp128 *%dst, double *%ptr) { ret void } +; Check that LXDB allows indices. 
+define void @f6(fp128 *%dst, double *%base, i64 %index) { +; CHECK-LABEL: f6: +; CHECK: sllg %r1, %r4, 3 +; CHECK: lxdb %f0, 800(%r1,%r3) +; CHECK: std %f0, 0(%r2) +; CHECK: std %f2, 8(%r2) +; CHECK: br %r14 + %ptr1 = getelementptr double, double *%base, i64 %index + %ptr2 = getelementptr double, double *%ptr1, i64 100 + %val = load double, double *%ptr2 + %res = call fp128 @llvm.experimental.constrained.fpext.f128.f64(double %val, + metadata !"fpexcept.strict") + store fp128 %res, fp128 *%dst + ret void +} + diff --git a/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll index aab60ce..ec1e674 100644 --- a/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll @@ -5504,15 +5504,14 @@ define <1 x double> @constrained_vector_fpext_v1f32() { ; S390X-LABEL: constrained_vector_fpext_v1f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI99_0 -; S390X-NEXT: le %f0, 0(%r1) -; S390X-NEXT: ldebr %f0, %f0 +; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fpext_v1f32: ; SZ13: # %bb.0: # %entry ; SZ13-NEXT: larl %r1, .LCPI99_0 -; SZ13-NEXT: lde %f0, 0(%r1) -; SZ13-NEXT: wldeb %v24, %f0 +; SZ13-NEXT: ldeb %f0, 0(%r1) +; SZ13-NEXT: vlr %v24, %v0 ; SZ13-NEXT: br %r14 entry: %result = call <1 x double> @llvm.experimental.constrained.fpext.v1f64.v1f32( @@ -5525,21 +5524,17 @@ define <2 x double> @constrained_vector_fpext_v2f32() { ; S390X-LABEL: constrained_vector_fpext_v2f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI100_0 -; S390X-NEXT: le %f0, 0(%r1) +; S390X-NEXT: ldeb %f2, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI100_1 -; S390X-NEXT: le %f1, 0(%r1) -; S390X-NEXT: ldebr %f2, %f0 -; S390X-NEXT: ldebr %f0, %f1 +; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fpext_v2f32: ; SZ13: # %bb.0: # %entry ; SZ13-NEXT: larl %r1, .LCPI100_0 -; 
SZ13-NEXT: lde %f0, 0(%r1) +; SZ13-NEXT: ldeb %f0, 0(%r1) ; SZ13-NEXT: larl %r1, .LCPI100_1 -; SZ13-NEXT: lde %f1, 0(%r1) -; SZ13-NEXT: ldebr %f0, %f0 -; SZ13-NEXT: ldebr %f1, %f1 +; SZ13-NEXT: ldeb %f1, 0(%r1) ; SZ13-NEXT: vmrhg %v24, %v1, %v0 ; SZ13-NEXT: br %r14 entry: @@ -5553,16 +5548,15 @@ define void @constrained_vector_fpext_v3f64(<3 x float>* %src, <3 x double>* %de ; S390X-LABEL: constrained_vector_fpext_v3f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: lg %r0, 0(%r2) -; S390X-NEXT: le %f0, 8(%r2) ; S390X-NEXT: sllg %r1, %r0, 32 -; S390X-NEXT: ldgr %f1, %r1 +; S390X-NEXT: ldgr %f0, %r1 ; S390X-NEXT: nilf %r0, 0 +; S390X-NEXT: ldeb %f1, 8(%r2) ; S390X-NEXT: ldgr %f2, %r0 ; S390X-NEXT: ldebr %f2, %f2 -; S390X-NEXT: ldebr %f1, %f1 ; S390X-NEXT: ldebr %f0, %f0 -; S390X-NEXT: std %f0, 16(%r3) -; S390X-NEXT: std %f1, 8(%r3) +; S390X-NEXT: std %f1, 16(%r3) +; S390X-NEXT: std %f0, 8(%r3) ; S390X-NEXT: std %f2, 0(%r3) ; S390X-NEXT: br %r14 ; @@ -5591,34 +5585,26 @@ define <4 x double> @constrained_vector_fpext_v4f32() { ; S390X-LABEL: constrained_vector_fpext_v4f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI102_0 -; S390X-NEXT: le %f0, 0(%r1) +; S390X-NEXT: ldeb %f6, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI102_1 -; S390X-NEXT: le %f1, 0(%r1) +; S390X-NEXT: ldeb %f4, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI102_2 -; S390X-NEXT: le %f2, 0(%r1) +; S390X-NEXT: ldeb %f2, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI102_3 -; S390X-NEXT: le %f3, 0(%r1) -; S390X-NEXT: ldebr %f6, %f0 -; S390X-NEXT: ldebr %f4, %f1 -; S390X-NEXT: ldebr %f2, %f2 -; S390X-NEXT: ldebr %f0, %f3 +; S390X-NEXT: ldeb %f0, 0(%r1) ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fpext_v4f32: ; SZ13: # %bb.0: # %entry ; SZ13-NEXT: larl %r1, .LCPI102_0 -; SZ13-NEXT: lde %f0, 0(%r1) +; SZ13-NEXT: ldeb %f0, 0(%r1) ; SZ13-NEXT: larl %r1, .LCPI102_1 -; SZ13-NEXT: lde %f1, 0(%r1) -; SZ13-NEXT: ldebr %f0, %f0 -; SZ13-NEXT: ldebr %f1, %f1 +; SZ13-NEXT: ldeb %f1, 0(%r1) ; SZ13-NEXT: larl %r1, .LCPI102_2 ; 
SZ13-NEXT: vmrhg %v24, %v1, %v0 -; SZ13-NEXT: lde %f0, 0(%r1) +; SZ13-NEXT: ldeb %f0, 0(%r1) ; SZ13-NEXT: larl %r1, .LCPI102_3 -; SZ13-NEXT: lde %f1, 0(%r1) -; SZ13-NEXT: ldebr %f0, %f0 -; SZ13-NEXT: ldebr %f1, %f1 +; SZ13-NEXT: ldeb %f1, 0(%r1) ; SZ13-NEXT: vmrhg %v26, %v1, %v0 ; SZ13-NEXT: br %r14 entry: -- 2.7.4