From 03a4702c884a0498db902aff34ebb19c48b4696b Mon Sep 17 00:00:00 2001
From: Fraser Cormack
Date: Tue, 13 Jul 2021 17:08:05 +0100
Subject: [PATCH] [RISCV] Fix the neutral element in vector 'fadd' reductions

Using positive zero as the neutral element in 'fadd' reductions, while
it generates better code, is incorrect. The correct neutral element is
negative zero: 0.0 + -0.0 = 0.0, whereas -0.0 + -0.0 = -0.0.

There are perhaps more optimal lowerings of negative zero avoiding
constant-pool loads which could be left as future work.

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D105902
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp    |   2 +-
 .../RISCV/rvv/fixed-vectors-reduction-fp.ll    | 187 +++++++++++++++------
 .../CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll |  36 +++-
 3 files changed, 162 insertions(+), 63 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 9d2ae67..e6527a9 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -3718,7 +3718,7 @@ getRVVFPReductionOpAndOperands(SDValue Op, SelectionDAG &DAG, EVT EltVT) {
     llvm_unreachable("Unhandled reduction");
   case ISD::VECREDUCE_FADD:
     return std::make_tuple(RISCVISD::VECREDUCE_FADD_VL, Op.getOperand(0),
-                           DAG.getConstantFP(0.0, DL, EltVT));
+                           DAG.getNeutralElement(BaseOpcode, DL, EltVT, Flags));
   case ISD::VECREDUCE_SEQ_FADD:
     return std::make_tuple(RISCVISD::VECREDUCE_SEQ_FADD_VL, Op.getOperand(1),
                            Op.getOperand(0));
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
index 76dabfc..b4219cd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+experimental-v,+experimental-zfh,+f,+d -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+experimental-v,+experimental-zfh,+f,+d -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+experimental-v,+experimental-zfh,+f,+d -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+experimental-v,+experimental-zfh,+f,+d -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
 
 declare half @llvm.vector.reduce.fadd.v1f16(half, <1 x half>)
 
@@ -38,10 +38,12 @@ declare half @llvm.vector.reduce.fadd.v2f16(half, <2 x half>)
 define half @vreduce_fadd_v2f16(<2 x half>* %x, half %s) {
 ; CHECK-LABEL: vreduce_fadd_v2f16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a1, %hi(.LCPI2_0)
+; CHECK-NEXT:    flh ft0, %lo(.LCPI2_0)(a1)
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vle16.v v25, (a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
-; CHECK-NEXT:    vmv.v.i v26, 0
+; CHECK-NEXT:    vfmv.v.f v26, ft0
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfredsum.vs v25, v25, v26
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
@@ -73,10 +75,12 @@ declare half @llvm.vector.reduce.fadd.v4f16(half, <4 x half>)
 define half @vreduce_fadd_v4f16(<4 x half>* %x, half %s) {
 ; CHECK-LABEL: vreduce_fadd_v4f16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a1, %hi(.LCPI4_0)
+; CHECK-NEXT:    flh ft0, %lo(.LCPI4_0)(a1)
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vle16.v v25, (a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
-; CHECK-NEXT:    vmv.v.i v26, 0
+; CHECK-NEXT:    vfmv.v.f v26, ft0
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfredsum.vs v25, v25, v26
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
@@ -108,10 +112,12 @@ declare half @llvm.vector.reduce.fadd.v8f16(half, <8 x half>)
 define half @vreduce_fadd_v8f16(<8 x half>* %x, half %s) {
 ; CHECK-LABEL: vreduce_fadd_v8f16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a1, %hi(.LCPI6_0)
+; CHECK-NEXT:    flh ft0, %lo(.LCPI6_0)(a1)
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vle16.v v25, (a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
-; CHECK-NEXT:    vmv.v.i v26, 0
+; CHECK-NEXT:    vfmv.v.f v26, ft0
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfredsum.vs v25, v25, v26
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
@@ -143,10 +149,12 @@ declare half @llvm.vector.reduce.fadd.v16f16(half, <16 x half>)
 define half @vreduce_fadd_v16f16(<16 x half>* %x, half %s) {
 ; CHECK-LABEL: vreduce_fadd_v16f16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a1, %hi(.LCPI8_0)
+; CHECK-NEXT:    flh ft0, %lo(.LCPI8_0)(a1)
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vle16.v v26, (a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
-; CHECK-NEXT:    vmv.v.i v25, 0
+; CHECK-NEXT:    vfmv.v.f v25, ft0
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vfredsum.vs v25, v26, v25
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
@@ -176,18 +184,35 @@ define half @vreduce_ord_fadd_v16f16(<16 x half>* %x, half %s) {
 declare half @llvm.vector.reduce.fadd.v32f16(half, <32 x half>)
 
 define half @vreduce_fadd_v32f16(<32 x half>* %x, half %s) {
-; CHECK-LABEL: vreduce_fadd_v32f16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi a1, zero, 32
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
-; CHECK-NEXT:    vle16.v v28, (a0)
-; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
-; CHECK-NEXT:    vmv.v.i v25, 0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
-; CHECK-NEXT:    vfredsum.vs v25, v28, v25
-; CHECK-NEXT:    vfmv.f.s ft0, v25
-; CHECK-NEXT:    fadd.h fa0, fa0, ft0
-; CHECK-NEXT:    ret
+; RV32-LABEL: vreduce_fadd_v32f16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi a1, zero, 32
+; RV32-NEXT:    lui a2, %hi(.LCPI10_0)
+; RV32-NEXT:    flh ft0, %lo(.LCPI10_0)(a2)
+; RV32-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
+; RV32-NEXT:    vle16.v v28, (a0)
+; RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
+; RV32-NEXT:    vfmv.v.f v25, ft0
+; RV32-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
+; RV32-NEXT:    vfredsum.vs v25, v28, v25
+; RV32-NEXT:    vfmv.f.s ft0, v25
+; RV32-NEXT:    fadd.h fa0, fa0, ft0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vreduce_fadd_v32f16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, %hi(.LCPI10_0)
+; RV64-NEXT:    flh ft0, %lo(.LCPI10_0)(a1)
+; RV64-NEXT:    addi a1, zero, 32
+; RV64-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
+; RV64-NEXT:    vle16.v v28, (a0)
+; RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
+; RV64-NEXT:    vfmv.v.f v25, ft0
+; RV64-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
+; RV64-NEXT:    vfredsum.vs v25, v28, v25
+; RV64-NEXT:    vfmv.f.s ft0, v25
+; RV64-NEXT:    fadd.h fa0, fa0, ft0
+; RV64-NEXT:    ret
   %v = load <32 x half>, <32 x half>* %x
   %red = call reassoc half @llvm.vector.reduce.fadd.v32f16(half %s, <32 x half> %v)
   ret half %red
@@ -213,18 +238,35 @@ define half @vreduce_ord_fadd_v32f16(<32 x half>* %x, half %s) {
 declare half @llvm.vector.reduce.fadd.v64f16(half, <64 x half>)
 
 define half @vreduce_fadd_v64f16(<64 x half>* %x, half %s) {
-; CHECK-LABEL: vreduce_fadd_v64f16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi a1, zero, 64
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
-; CHECK-NEXT:    vmv.v.i v25, 0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT:    vfredsum.vs v25, v8, v25
-; CHECK-NEXT:    vfmv.f.s ft0, v25
-; CHECK-NEXT:    fadd.h fa0, fa0, ft0
-; CHECK-NEXT:    ret
+; RV32-LABEL: vreduce_fadd_v64f16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi a1, zero, 64
+; RV32-NEXT:    lui a2, %hi(.LCPI12_0)
+; RV32-NEXT:    flh ft0, %lo(.LCPI12_0)(a2)
+; RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
+; RV32-NEXT:    vle16.v v8, (a0)
+; RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
+; RV32-NEXT:    vfmv.v.f v25, ft0
+; RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
+; RV32-NEXT:    vfredsum.vs v25, v8, v25
+; RV32-NEXT:    vfmv.f.s ft0, v25
+; RV32-NEXT:    fadd.h fa0, fa0, ft0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vreduce_fadd_v64f16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, %hi(.LCPI12_0)
+; RV64-NEXT:    flh ft0, %lo(.LCPI12_0)(a1)
+; RV64-NEXT:    addi a1, zero, 64
+; RV64-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
+; RV64-NEXT:    vle16.v v8, (a0)
+; RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
+; RV64-NEXT:    vfmv.v.f v25, ft0
+; RV64-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
+; RV64-NEXT:    vfredsum.vs v25, v8, v25
+; RV64-NEXT:    vfmv.f.s ft0, v25
+; RV64-NEXT:    fadd.h fa0, fa0, ft0
+; RV64-NEXT:    ret
   %v = load <64 x half>, <64 x half>* %x
   %red = call reassoc half @llvm.vector.reduce.fadd.v64f16(half %s, <64 x half> %v)
   ret half %red
@@ -257,9 +299,11 @@ define half @vreduce_fadd_v128f16(<128 x half>* %x, half %s) {
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vle16.v v16, (a0)
+; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
+; CHECK-NEXT:    flh ft0, %lo(.LCPI14_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v16
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
-; CHECK-NEXT:    vmv.v.i v25, 0
+; CHECK-NEXT:    vfmv.v.f v25, ft0
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
 ; CHECK-NEXT:    vfredsum.vs v25, v8, v25
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
@@ -330,10 +374,12 @@ declare float @llvm.vector.reduce.fadd.v2f32(float, <2 x float>)
 define float @vreduce_fadd_v2f32(<2 x float>* %x, float %s) {
 ; CHECK-LABEL: vreduce_fadd_v2f32:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a1, %hi(.LCPI18_0)
+; CHECK-NEXT:    flw ft0, %lo(.LCPI18_0)(a1)
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vle32.v v25, (a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
-; CHECK-NEXT:    vmv.v.i v26, 0
+; CHECK-NEXT:    vfmv.v.f v26, ft0
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfredsum.vs v25, v25, v26
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
@@ -365,10 +411,12 @@ declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
 define float @vreduce_fadd_v4f32(<4 x float>* %x, float %s) {
 ; CHECK-LABEL: vreduce_fadd_v4f32:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a1, %hi(.LCPI20_0)
+; CHECK-NEXT:    flw ft0, %lo(.LCPI20_0)(a1)
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vle32.v v25, (a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
-; CHECK-NEXT:    vmv.v.i v26, 0
+; CHECK-NEXT:    vfmv.v.f v26, ft0
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfredsum.vs v25, v25, v26
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
@@ -400,10 +448,12 @@ declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>)
 define float @vreduce_fadd_v8f32(<8 x float>* %x, float %s) {
 ; CHECK-LABEL: vreduce_fadd_v8f32:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a1, %hi(.LCPI22_0)
+; CHECK-NEXT:    flw ft0, %lo(.LCPI22_0)(a1)
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vle32.v v26, (a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
-; CHECK-NEXT:    vmv.v.i v25, 0
+; CHECK-NEXT:    vfmv.v.f v25, ft0
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vfredsum.vs v25, v26, v25
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
@@ -435,10 +485,12 @@ declare float @llvm.vector.reduce.fadd.v16f32(float, <16 x float>)
 define float @vreduce_fadd_v16f32(<16 x float>* %x, float %s) {
 ; CHECK-LABEL: vreduce_fadd_v16f32:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a1, %hi(.LCPI24_0)
+; CHECK-NEXT:    flw ft0, %lo(.LCPI24_0)(a1)
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vle32.v v28, (a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
-; CHECK-NEXT:    vmv.v.i v25, 0
+; CHECK-NEXT:    vfmv.v.f v25, ft0
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vfredsum.vs v25, v28, v25
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
@@ -468,18 +520,35 @@ define float @vreduce_ord_fadd_v16f32(<16 x float>* %x, float %s) {
 declare float @llvm.vector.reduce.fadd.v32f32(float, <32 x float>)
 
 define float @vreduce_fadd_v32f32(<32 x float>* %x, float %s) {
-; CHECK-LABEL: vreduce_fadd_v32f32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi a1, zero, 32
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
-; CHECK-NEXT:    vmv.v.i v25, 0
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT:    vfredsum.vs v25, v8, v25
-; CHECK-NEXT:    vfmv.f.s ft0, v25
-; CHECK-NEXT:    fadd.s fa0, fa0, ft0
-; CHECK-NEXT:    ret
+; RV32-LABEL: vreduce_fadd_v32f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi a1, zero, 32
+; RV32-NEXT:    lui a2, %hi(.LCPI26_0)
+; RV32-NEXT:    flw ft0, %lo(.LCPI26_0)(a2)
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
+; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
+; RV32-NEXT:    vfmv.v.f v25, ft0
+; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
+; RV32-NEXT:    vfredsum.vs v25, v8, v25
+; RV32-NEXT:    vfmv.f.s ft0, v25
+; RV32-NEXT:    fadd.s fa0, fa0, ft0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vreduce_fadd_v32f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, %hi(.LCPI26_0)
+; RV64-NEXT:    flw ft0, %lo(.LCPI26_0)(a1)
+; RV64-NEXT:    addi a1, zero, 32
+; RV64-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
+; RV64-NEXT:    vle32.v v8, (a0)
+; RV64-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
+; RV64-NEXT:    vfmv.v.f v25, ft0
+; RV64-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
+; RV64-NEXT:    vfredsum.vs v25, v8, v25
+; RV64-NEXT:    vfmv.f.s ft0, v25
+; RV64-NEXT:    fadd.s fa0, fa0, ft0
+; RV64-NEXT:    ret
   %v = load <32 x float>, <32 x float>* %x
   %red = call reassoc float @llvm.vector.reduce.fadd.v32f32(float %s, <32 x float> %v)
   ret float %red
@@ -512,9 +581,11 @@ define float @vreduce_fadd_v64f32(<64 x float>* %x, float %s) {
 ; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vle32.v v16, (a0)
+; CHECK-NEXT:    lui a0, %hi(.LCPI28_0)
+; CHECK-NEXT:    flw ft0, %lo(.LCPI28_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v16
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
-; CHECK-NEXT:    vmv.v.i v25, 0
+; CHECK-NEXT:    vfmv.v.f v25, ft0
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 ; CHECK-NEXT:    vfredsum.vs v25, v8, v25
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
@@ -585,10 +656,12 @@ declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)
 define double @vreduce_fadd_v2f64(<2 x double>* %x, double %s) {
 ; CHECK-LABEL: vreduce_fadd_v2f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a1, %hi(.LCPI32_0)
+; CHECK-NEXT:    fld ft0, %lo(.LCPI32_0)(a1)
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vle64.v v25, (a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
-; CHECK-NEXT:    vmv.v.i v26, 0
+; CHECK-NEXT:    vfmv.v.f v26, ft0
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfredsum.vs v25, v25, v26
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
@@ -620,10 +693,12 @@ declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)
 define double @vreduce_fadd_v4f64(<4 x double>* %x, double %s) {
 ; CHECK-LABEL: vreduce_fadd_v4f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a1, %hi(.LCPI34_0)
+; CHECK-NEXT:    fld ft0, %lo(.LCPI34_0)(a1)
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vle64.v v26, (a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
-; CHECK-NEXT:    vmv.v.i v25, 0
+; CHECK-NEXT:    vfmv.v.f v25, ft0
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vfredsum.vs v25, v26, v25
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
@@ -655,10 +730,12 @@ declare double @llvm.vector.reduce.fadd.v8f64(double, <8 x double>)
 define double @vreduce_fadd_v8f64(<8 x double>* %x, double %s) {
 ; CHECK-LABEL: vreduce_fadd_v8f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a1, %hi(.LCPI36_0)
+; CHECK-NEXT:    fld ft0, %lo(.LCPI36_0)(a1)
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; CHECK-NEXT:    vle64.v v28, (a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
-; CHECK-NEXT:    vmv.v.i v25, 0
+; CHECK-NEXT:    vfmv.v.f v25, ft0
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; CHECK-NEXT:    vfredsum.vs v25, v28, v25
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
@@ -690,10 +767,12 @@ declare double @llvm.vector.reduce.fadd.v16f64(double, <16 x double>)
 define double @vreduce_fadd_v16f64(<16 x double>* %x, double %s) {
 ; CHECK-LABEL: vreduce_fadd_v16f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a1, %hi(.LCPI38_0)
+; CHECK-NEXT:    fld ft0, %lo(.LCPI38_0)(a1)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
 ; CHECK-NEXT:    vle64.v v8, (a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
-; CHECK-NEXT:    vmv.v.i v25, 0
+; CHECK-NEXT:    vfmv.v.f v25, ft0
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
 ; CHECK-NEXT:    vfredsum.vs v25, v8, v25
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
@@ -729,9 +808,11 @@ define double @vreduce_fadd_v32f64(<32 x double>* %x, double %s) {
 ; CHECK-NEXT:    vle64.v v8, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vle64.v v16, (a0)
+; CHECK-NEXT:    lui a0, %hi(.LCPI40_0)
+; CHECK-NEXT:    fld ft0, %lo(.LCPI40_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v16
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
-; CHECK-NEXT:    vmv.v.i v25, 0
+; CHECK-NEXT:    vfmv.v.f v25, ft0
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
 ; CHECK-NEXT:    vfredsum.vs v25, v8, v25
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
index f9f7cdd..9c0199b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
@@ -9,8 +9,10 @@ declare half @llvm.vector.reduce.fadd.nxv1f16(half, <vscale x 1 x half>)
 define half @vreduce_fadd_nxv1f16(<vscale x 1 x half> %v, half %s) {
 ; CHECK-LABEL: vreduce_fadd_nxv1f16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
+; CHECK-NEXT:    flh ft0, %lo(.LCPI0_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
-; CHECK-NEXT:    vmv.v.i v25, 0
+; CHECK-NEXT:    vfmv.v.f v25, ft0
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfredsum.vs v25, v8, v25
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
@@ -38,8 +40,10 @@ declare half @llvm.vector.reduce.fadd.nxv2f16(half, <vscale x 2 x half>)
 define half @vreduce_fadd_nxv2f16(<vscale x 2 x half> %v, half %s) {
 ; CHECK-LABEL: vreduce_fadd_nxv2f16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
+; CHECK-NEXT:    flh ft0, %lo(.LCPI2_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
-; CHECK-NEXT:    vmv.v.i v25, 0
+; CHECK-NEXT:    vfmv.v.f v25, ft0
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfredsum.vs v25, v8, v25
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
@@ -67,8 +71,10 @@ declare half @llvm.vector.reduce.fadd.nxv4f16(half, <vscale x 4 x half>)
 define half @vreduce_fadd_nxv4f16(<vscale x 4 x half> %v, half %s) {
 ; CHECK-LABEL: vreduce_fadd_nxv4f16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
+; CHECK-NEXT:    flh ft0, %lo(.LCPI4_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
-; CHECK-NEXT:    vmv.v.i v25, 0
+; CHECK-NEXT:    vfmv.v.f v25, ft0
 ; CHECK-NEXT:    vfredsum.vs v25, v8, v25
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
 ; CHECK-NEXT:    fadd.h fa0, fa0, ft0
@@ -94,8 +100,10 @@ declare float @llvm.vector.reduce.fadd.nxv1f32(float, <vscale x 1 x float>)
 define float @vreduce_fadd_nxv1f32(<vscale x 1 x float> %v, float %s) {
 ; CHECK-LABEL: vreduce_fadd_nxv1f32:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a0, %hi(.LCPI6_0)
+; CHECK-NEXT:    flw ft0, %lo(.LCPI6_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
-; CHECK-NEXT:    vmv.v.i v25, 0
+; CHECK-NEXT:    vfmv.v.f v25, ft0
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfredsum.vs v25, v8, v25
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
@@ -123,8 +131,10 @@ declare float @llvm.vector.reduce.fadd.nxv2f32(float, <vscale x 2 x float>)
 define float @vreduce_fadd_nxv2f32(<vscale x 2 x float> %v, float %s) {
 ; CHECK-LABEL: vreduce_fadd_nxv2f32:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a0, %hi(.LCPI8_0)
+; CHECK-NEXT:    flw ft0, %lo(.LCPI8_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
-; CHECK-NEXT:    vmv.v.i v25, 0
+; CHECK-NEXT:    vfmv.v.f v25, ft0
 ; CHECK-NEXT:    vfredsum.vs v25, v8, v25
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
 ; CHECK-NEXT:    fadd.s fa0, fa0, ft0
@@ -150,8 +160,10 @@ declare float @llvm.vector.reduce.fadd.nxv4f32(float, <vscale x 4 x float>)
 define float @vreduce_fadd_nxv4f32(<vscale x 4 x float> %v, float %s) {
 ; CHECK-LABEL: vreduce_fadd_nxv4f32:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a0, %hi(.LCPI10_0)
+; CHECK-NEXT:    flw ft0, %lo(.LCPI10_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
-; CHECK-NEXT:    vmv.v.i v25, 0
+; CHECK-NEXT:    vfmv.v.f v25, ft0
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vfredsum.vs v25, v8, v25
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
@@ -179,8 +191,10 @@ declare double @llvm.vector.reduce.fadd.nxv1f64(double, <vscale x 1 x double>)
 define double @vreduce_fadd_nxv1f64(<vscale x 1 x double> %v, double %s) {
 ; CHECK-LABEL: vreduce_fadd_nxv1f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
+; CHECK-NEXT:    fld ft0, %lo(.LCPI12_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
-; CHECK-NEXT:    vmv.v.i v25, 0
+; CHECK-NEXT:    vfmv.v.f v25, ft0
 ; CHECK-NEXT:    vfredsum.vs v25, v8, v25
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
 ; CHECK-NEXT:    fadd.d fa0, fa0, ft0
@@ -206,8 +220,10 @@ declare double @llvm.vector.reduce.fadd.nxv2f64(double, <vscale x 2 x double>)
 define double @vreduce_fadd_nxv2f64(<vscale x 2 x double> %v, double %s) {
 ; CHECK-LABEL: vreduce_fadd_nxv2f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
+; CHECK-NEXT:    fld ft0, %lo(.LCPI14_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
-; CHECK-NEXT:    vmv.v.i v25, 0
+; CHECK-NEXT:    vfmv.v.f v25, ft0
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vfredsum.vs v25, v8, v25
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
@@ -235,8 +251,10 @@ declare double @llvm.vector.reduce.fadd.nxv4f64(double, <vscale x 4 x double>)
 define double @vreduce_fadd_nxv4f64(<vscale x 4 x double> %v, double %s) {
 ; CHECK-LABEL: vreduce_fadd_nxv4f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a0, %hi(.LCPI16_0)
+; CHECK-NEXT:    fld ft0, %lo(.LCPI16_0)(a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
-; CHECK-NEXT:    vmv.v.i v25, 0
+; CHECK-NEXT:    vfmv.v.f v25, ft0
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vfredsum.vs v25, v8, v25
 ; CHECK-NEXT:    vfmv.f.s ft0, v25
-- 
2.7.4
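
As a standalone illustration of the commit message's argument (not part of the patch), the short C++ program below shows why -0.0 rather than +0.0 is the identity for an 'fadd' reduction. The file and variable names are invented for the example, and it assumes IEEE 754 round-to-nearest semantics for double.

// illustrate_neutral_element.cpp -- standalone sketch, not part of the patch.
// Demonstrates why -0.0, not +0.0, is the neutral element of 'fadd':
// under round-to-nearest, +0.0 + -0.0 = +0.0, but -0.0 + -0.0 = -0.0.
#include <cstdio>

int main() {
  const double Vec[4] = {-0.0, -0.0, -0.0, -0.0};

  double AccPos = 0.0;  // the old, incorrect neutral element
  double AccNeg = -0.0; // the correct neutral element
  for (double Elt : Vec) {
    AccPos += Elt;
    AccNeg += Elt;
  }

  // Prints "0.000000 -0.000000": the +0.0 seed flips the sign of the
  // all-negative-zero reduction, while the -0.0 seed preserves it.
  std::printf("%f %f\n", AccPos, AccNeg);
  return 0;
}

This mirrors the lowering exercised by the tests above: the reassoc form splats the neutral element into the accumulator register for vfredsum and only adds the scalar start value %s afterwards, so the splatted value must be a true identity for every possible element, including -0.0.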