}
// Template for instructions which take three fp64 or fp32 args. The
-// instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
+// instructions are named "<OpcStr>.f<Width>" (e.g. "min.f64").
//
// Also defines ftz (flush subnormal inputs and results to sign-preserving
// zero) variants for fp32 functions.
+//
+// This multiclass should be used for nodes that cannot be folded into FMAs.
+// For nodes that can be folded into FMAs (i.e. adds and muls), use
+// F3_fma_component.
multiclass F3<string OpcStr, SDNode OpNode> {
def f64rr :
NVPTXInst<(outs Float64Regs:$dst),
(ins Float64Regs:$a, Float64Regs:$b),
!strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+ [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>;
+ def f64ri :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, f64imm:$b),
+ !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+ [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;
+ def f32rr_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[doF32FTZ]>;
+ def f32ri_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+ Requires<[doF32FTZ]>;
+ def f32rr :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>;
+ def f32ri :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;
+}
+
+// Template for instructions which take three fp64 or fp32 args. The
+// instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
+//
+// Also defines ftz (flush subnormal inputs and results to sign-preserving
+// zero) variants for fp32 functions.
+//
+// This multiclass should be used for nodes that can be folded to make fma ops.
+// In this case, we use the ".rn" variant when FMA is disabled, as this behaves
+// just like the non ".rn" op, but prevents ptxas from creating FMAs.
+multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
+ def f64rr :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, Float64Regs:$b),
+ !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
[(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
Requires<[allowFMA]>;
def f64ri :
!strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
[(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
Requires<[allowFMA]>;
-}
-// Same as F3, but defines ".rn" variants (round to nearest even).
-multiclass F3_rn<string OpcStr, SDNode OpNode> {
- def f64rr :
+ // These have strange names so we don't perturb existing mir tests.
+ def _rnf64rr :
NVPTXInst<(outs Float64Regs:$dst),
(ins Float64Regs:$a, Float64Regs:$b),
!strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
[(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
Requires<[noFMA]>;
- def f64ri :
+ def _rnf64ri :
NVPTXInst<(outs Float64Regs:$dst),
(ins Float64Regs:$a, f64imm:$b),
!strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
[(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
Requires<[noFMA]>;
- def f32rr_ftz :
+ def _rnf32rr_ftz :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b),
!strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
[(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
Requires<[noFMA, doF32FTZ]>;
- def f32ri_ftz :
+ def _rnf32ri_ftz :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),
!strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
[(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
Requires<[noFMA, doF32FTZ]>;
- def f32rr :
+ def _rnf32rr :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b),
!strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
[(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
Requires<[noFMA]>;
- def f32ri :
+ def _rnf32ri :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),
!strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
N->getValueAPF().convertToDouble() == 1.0;
}]>;
-defm FADD : F3<"add", fadd>;
-defm FSUB : F3<"sub", fsub>;
-defm FMUL : F3<"mul", fmul>;
+defm FADD : F3_fma_component<"add", fadd>;
+defm FSUB : F3_fma_component<"sub", fsub>;
+defm FMUL : F3_fma_component<"mul", fmul>;
-defm FADD_rn : F3_rn<"add", fadd>;
-defm FSUB_rn : F3_rn<"sub", fsub>;
-defm FMUL_rn : F3_rn<"mul", fmul>;
+defm FMIN : F3<"min", fminnum>;
+defm FMAX : F3<"max", fmaxnum>;
defm FABS : F2<"abs", fabs>;
defm FNEG : F2<"neg", fneg>;
def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone,
[SDNPHasChain, SDNPOptInGlue]>;
+// fceil, ffloor, fround, ftrunc.
+
+def : Pat<(fceil Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRPI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(fceil Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRPI)>, Requires<[doNoF32FTZ]>;
+def : Pat<(fceil Float64Regs:$a),
+ (CVT_f64_f64 Float64Regs:$a, CvtRPI)>;
+
+def : Pat<(ffloor Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRMI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(ffloor Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRMI)>, Requires<[doNoF32FTZ]>;
+def : Pat<(ffloor Float64Regs:$a),
+ (CVT_f64_f64 Float64Regs:$a, CvtRMI)>;
+
+def : Pat<(fround Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f32 (fround Float32Regs:$a)),
+ (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
+def : Pat<(f64 (fround Float64Regs:$a)),
+ (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
+
+def : Pat<(ftrunc Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(ftrunc Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRZI)>, Requires<[doNoF32FTZ]>;
+def : Pat<(ftrunc Float64Regs:$a),
+ (CVT_f64_f64 Float64Regs:$a, CvtRZI)>;
+
+// nearbyint and rint are implemented as rounding to nearest even. This isn't
+// strictly correct, because it causes us to ignore the rounding mode. But it
+// matches what CUDA's "libm" does.
+
+def : Pat<(fnearbyint Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(fnearbyint Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
+def : Pat<(fnearbyint Float64Regs:$a),
+ (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
+
+def : Pat<(frint Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(frint Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
+def : Pat<(frint Float64Regs:$a),
+ (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
+
+
//-----------------------------------
// Control-flow
//-----------------------------------
--- /dev/null
+; RUN: llc < %s | FileCheck %s
+target triple = "nvptx64-nvidia-cuda"
+
+; Checks that llvm intrinsics for math functions are correctly lowered to PTX.
+
+declare float @llvm.ceil.f32(float) #0
+declare double @llvm.ceil.f64(double) #0
+declare float @llvm.floor.f32(float) #0
+declare double @llvm.floor.f64(double) #0
+declare float @llvm.round.f32(float) #0
+declare double @llvm.round.f64(double) #0
+declare float @llvm.nearbyint.f32(float) #0
+declare double @llvm.nearbyint.f64(double) #0
+declare float @llvm.rint.f32(float) #0
+declare double @llvm.rint.f64(double) #0
+declare float @llvm.trunc.f32(float) #0
+declare double @llvm.trunc.f64(double) #0
+declare float @llvm.fabs.f32(float) #0
+declare double @llvm.fabs.f64(double) #0
+declare float @llvm.minnum.f32(float, float) #0
+declare double @llvm.minnum.f64(double, double) #0
+declare float @llvm.maxnum.f32(float, float) #0
+declare double @llvm.maxnum.f64(double, double) #0
+
+; ---- ceil ----
+
+; CHECK-LABEL: ceil_float
+define float @ceil_float(float %a) {
+ ; CHECK: cvt.rpi.f32.f32
+ %b = call float @llvm.ceil.f32(float %a)
+ ret float %b
+}
+
+; CHECK-LABEL: ceil_float_ftz
+define float @ceil_float_ftz(float %a) #1 {
+ ; CHECK: cvt.rpi.ftz.f32.f32
+ %b = call float @llvm.ceil.f32(float %a)
+ ret float %b
+}
+
+; CHECK-LABEL: ceil_double
+define double @ceil_double(double %a) {
+ ; CHECK: cvt.rpi.f64.f64
+ %b = call double @llvm.ceil.f64(double %a)
+ ret double %b
+}
+
+; ---- floor ----
+
+; CHECK-LABEL: floor_float
+define float @floor_float(float %a) {
+ ; CHECK: cvt.rmi.f32.f32
+ %b = call float @llvm.floor.f32(float %a)
+ ret float %b
+}
+
+; CHECK-LABEL: floor_float_ftz
+define float @floor_float_ftz(float %a) #1 {
+ ; CHECK: cvt.rmi.ftz.f32.f32
+ %b = call float @llvm.floor.f32(float %a)
+ ret float %b
+}
+
+; CHECK-LABEL: floor_double
+define double @floor_double(double %a) {
+ ; CHECK: cvt.rmi.f64.f64
+ %b = call double @llvm.floor.f64(double %a)
+ ret double %b
+}
+
+; ---- round ----
+
+; CHECK-LABEL: round_float
+define float @round_float(float %a) {
+ ; CHECK: cvt.rni.f32.f32
+ %b = call float @llvm.round.f32(float %a)
+ ret float %b
+}
+
+; CHECK-LABEL: round_float_ftz
+define float @round_float_ftz(float %a) #1 {
+ ; CHECK: cvt.rni.ftz.f32.f32
+ %b = call float @llvm.round.f32(float %a)
+ ret float %b
+}
+
+; CHECK-LABEL: round_double
+define double @round_double(double %a) {
+ ; CHECK: cvt.rni.f64.f64
+ %b = call double @llvm.round.f64(double %a)
+ ret double %b
+}
+
+; ---- nearbyint ----
+
+; CHECK-LABEL: nearbyint_float
+define float @nearbyint_float(float %a) {
+ ; CHECK: cvt.rni.f32.f32
+ %b = call float @llvm.nearbyint.f32(float %a)
+ ret float %b
+}
+
+; CHECK-LABEL: nearbyint_float_ftz
+define float @nearbyint_float_ftz(float %a) #1 {
+ ; CHECK: cvt.rni.ftz.f32.f32
+ %b = call float @llvm.nearbyint.f32(float %a)
+ ret float %b
+}
+
+; CHECK-LABEL: nearbyint_double
+define double @nearbyint_double(double %a) {
+ ; CHECK: cvt.rni.f64.f64
+ %b = call double @llvm.nearbyint.f64(double %a)
+ ret double %b
+}
+
+; ---- rint ----
+
+; CHECK-LABEL: rint_float
+define float @rint_float(float %a) {
+ ; CHECK: cvt.rni.f32.f32
+ %b = call float @llvm.rint.f32(float %a)
+ ret float %b
+}
+
+; CHECK-LABEL: rint_float_ftz
+define float @rint_float_ftz(float %a) #1 {
+ ; CHECK: cvt.rni.ftz.f32.f32
+ %b = call float @llvm.rint.f32(float %a)
+ ret float %b
+}
+
+; CHECK-LABEL: rint_double
+define double @rint_double(double %a) {
+ ; CHECK: cvt.rni.f64.f64
+ %b = call double @llvm.rint.f64(double %a)
+ ret double %b
+}
+
+; ---- trunc ----
+
+; CHECK-LABEL: trunc_float
+define float @trunc_float(float %a) {
+ ; CHECK: cvt.rzi.f32.f32
+ %b = call float @llvm.trunc.f32(float %a)
+ ret float %b
+}
+
+; CHECK-LABEL: trunc_float_ftz
+define float @trunc_float_ftz(float %a) #1 {
+ ; CHECK: cvt.rzi.ftz.f32.f32
+ %b = call float @llvm.trunc.f32(float %a)
+ ret float %b
+}
+
+; CHECK-LABEL: trunc_double
+define double @trunc_double(double %a) {
+ ; CHECK: cvt.rzi.f64.f64
+ %b = call double @llvm.trunc.f64(double %a)
+ ret double %b
+}
+
+; ---- abs ----
+
+; CHECK-LABEL: abs_float
+define float @abs_float(float %a) {
+ ; CHECK: abs.f32
+ %b = call float @llvm.fabs.f32(float %a)
+ ret float %b
+}
+
+; CHECK-LABEL: abs_float_ftz
+define float @abs_float_ftz(float %a) #1 {
+ ; CHECK: abs.ftz.f32
+ %b = call float @llvm.fabs.f32(float %a)
+ ret float %b
+}
+
+; CHECK-LABEL: abs_double
+define double @abs_double(double %a) {
+ ; CHECK: abs.f64
+ %b = call double @llvm.fabs.f64(double %a)
+ ret double %b
+}
+
+; ---- min ----
+
+; CHECK-LABEL: min_float
+define float @min_float(float %a, float %b) {
+ ; CHECK: min.f32
+ %x = call float @llvm.minnum.f32(float %a, float %b)
+ ret float %x
+}
+
+; CHECK-LABEL: min_imm1
+define float @min_imm1(float %a) {
+ ; CHECK: min.f32
+ %x = call float @llvm.minnum.f32(float %a, float 0.0)
+ ret float %x
+}
+
+; CHECK-LABEL: min_imm2
+define float @min_imm2(float %a) {
+ ; CHECK: min.f32
+ %x = call float @llvm.minnum.f32(float 0.0, float %a)
+ ret float %x
+}
+
+; CHECK-LABEL: min_float_ftz
+define float @min_float_ftz(float %a, float %b) #1 {
+ ; CHECK: min.ftz.f32
+ %x = call float @llvm.minnum.f32(float %a, float %b)
+ ret float %x
+}
+
+; CHECK-LABEL: min_double
+define double @min_double(double %a, double %b) {
+ ; CHECK: min.f64
+ %x = call double @llvm.minnum.f64(double %a, double %b)
+ ret double %x
+}
+
+; ---- max ----
+
+; CHECK-LABEL: max_imm1
+define float @max_imm1(float %a) {
+ ; CHECK: max.f32
+ %x = call float @llvm.maxnum.f32(float %a, float 0.0)
+ ret float %x
+}
+
+; CHECK-LABEL: max_imm2
+define float @max_imm2(float %a) {
+ ; CHECK: max.f32
+ %x = call float @llvm.maxnum.f32(float 0.0, float %a)
+ ret float %x
+}
+
+; CHECK-LABEL: max_float
+define float @max_float(float %a, float %b) {
+ ; CHECK: max.f32
+ %x = call float @llvm.maxnum.f32(float %a, float %b)
+ ret float %x
+}
+
+; CHECK-LABEL: max_float_ftz
+define float @max_float_ftz(float %a, float %b) #1 {
+ ; CHECK: max.ftz.f32
+ %x = call float @llvm.maxnum.f32(float %a, float %b)
+ ret float %x
+}
+
+; CHECK-LABEL: max_double
+define double @max_double(double %a, double %b) {
+ ; CHECK: max.f64
+ %x = call double @llvm.maxnum.f64(double %a, double %b)
+ ret double %x
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { "nvptx-f32ftz" = "true" }