DAGCombiner optimization for pow(x,0.75) and pow(x,0.25) on double and single precisi...

author Masoud Ataei <msd.ataei@gmail.com>

Fri, 12 Jun 2020 14:02:16 +0000 (10:02 -0400)

committer Masoud Ataei <msd.ataei@gmail.com>

Fri, 12 Jun 2020 14:02:16 +0000 (10:02 -0400)
author Masoud Ataei <msd.ataei@gmail.com>
Fri, 12 Jun 2020 14:02:16 +0000 (10:02 -0400)
committer Masoud Ataei <msd.ataei@gmail.com>
Fri, 12 Jun 2020 14:02:16 +0000 (10:02 -0400)
diff --git a/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp b/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp

index 429b8a3..2b0e604 100644 (file)
--- a/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp
+++ b/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp
@@ -54,6 +54,7 @@ private:
    static StringRef getCPUSuffix(const PPCSubtarget *Subtarget);
    static std::string createMASSVFuncName(Function &Func,
                                           const PPCSubtarget *Subtarget);
+  bool handlePowSpecialCases(CallInst *CI, Function &Func, Module &M);
    bool lowerMASSVCall(CallInst *CI, Function &Func, Module &M,
                        const PPCSubtarget *Subtarget);
  };
@@ -96,6 +97,34 @@ PPCLowerMASSVEntries::createMASSVFuncName(Function &Func,
    return MASSVEntryName;
  }
  
+/// Turns a __powf4_massv/__powd2_massv call into llvm.pow when the exponent is
+/// a constant splat of 0.25 or 0.75 and ninf+afn (plus nsz for 0.25) are set.
+/// Returns true if the call was rewritten, false if it was left untouched.
+bool PPCLowerMASSVEntries::handlePowSpecialCases(CallInst *CI, Function &Func,
+                                                 Module &M) {
+  if (Func.getName() != "__powf4_massv" && Func.getName() != "__powd2_massv")
+    return false;
+
+  if (Constant *Exp = dyn_cast<Constant>(CI->getArgOperand(1)))
+    // A non-splat constant has a null splat value, hence dyn_cast_or_null.
+    if (ConstantFP *CFP = dyn_cast_or_null<ConstantFP>(Exp->getSplatValue())) {
+      // If the argument is 0.75 or 0.25 it is cheaper to turn it into pow
+      // intrinsic so that it could be optimized as sequence of sqrt's.
+      if (!CI->hasNoInfs() || !CI->hasApproxFunc())
+        return false;
+
+      if (!CFP->isExactlyValue(0.75) && !CFP->isExactlyValue(0.25))
+        return false;
+      if (CFP->isExactlyValue(0.25) && !CI->hasNoSignedZeros())
+        return false;
+
+      CI->setCalledFunction(
+          Intrinsic::getDeclaration(&M, Intrinsic::pow, CI->getType()));
+      return true;
+    }
+  return false;
+}
+
  /// Lowers generic MASSV entries to PowerPC subtarget-specific MASSV entries.
  /// e.g.: __sind2_massv --> __sind2_P9 for a Power9 subtarget.
  /// Both function prototypes and their callsites are updated during lowering.
@@ -105,6 +134,10 @@ bool PPCLowerMASSVEntries::lowerMASSVCall(CallInst *CI, Function &Func,
    if (CI->use_empty())
      return false;
  
+  // Handling pow(x, 0.25), pow(x, 0.75), powf(x, 0.25), powf(x, 0.75)
+  if (handlePowSpecialCases(CI, Func, M))
+    return true;
+
    std::string MASSVEntryName = createMASSVFuncName(Func, Subtarget);
    FunctionCallee FCache = M.getOrInsertFunction(
        MASSVEntryName, Func.getFunctionType(), Func.getAttributes());
diff --git a/llvm/test/CodeGen/PowerPC/pow_massv_075_025exp.ll b/llvm/test/CodeGen/PowerPC/pow_massv_075_025exp.ll

new file mode 100644 (file)

index 0000000..43b0477
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/pow_massv_075_025exp.ll
@@ -0,0 +1,166 @@
+; RUN: llc -vector-library=MASSV < %s -mtriple=powerpc64le-unknown-unknown -mcpu=pwr9 | FileCheck -check-prefixes=CHECK-PWR9 %s
+; RUN: llc -vector-library=MASSV < %s -mtriple=powerpc64le-unknown-unknown -mcpu=pwr8 | FileCheck -check-prefixes=CHECK-PWR8 %s
+
+; Exponent is a variable: the subtarget-specific MASSV call must be kept.
+define void @my_vpow_var(double* nocapture %z, double* nocapture readonly %y, double* nocapture readonly %x) {
+; CHECK-LABEL:       @my_vpow_var
+; CHECK-PWR9:        bl __powd2_P9
+; CHECK-PWR8:        bl __powd2_P8
+; CHECK:             blr
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
+  %next.gep = getelementptr double, double* %z, i64 %index
+  %next.gep31 = getelementptr double, double* %y, i64 %index
+  %next.gep32 = getelementptr double, double* %x, i64 %index
+  %0 = bitcast double* %next.gep32 to <2 x double>*
+  %wide.load = load <2 x double>, <2 x double>* %0, align 8
+  %1 = bitcast double* %next.gep31 to <2 x double>*
+  %wide.load33 = load <2 x double>, <2 x double>* %1, align 8
+  %2 = call ninf afn nsz <2 x double> @__powd2_massv(<2 x double> %wide.load, <2 x double> %wide.load33)
+  %3 = bitcast double* %next.gep to <2 x double>*
+  store <2 x double> %2, <2 x double>* %3, align 8
+  %index.next = add i64 %index, 2
+  %4 = icmp eq i64 %index.next, 1024
+  br i1 %4, label %for.end, label %vector.body
+
+for.end:
+  ret void
+}
+
+; Exponent is a constant other than 0.75 and 0.25 (0.76): MASSV call is kept.
+define void @my_vpow_const(double* nocapture %y, double* nocapture readonly %x) {
+; CHECK-LABEL:       @my_vpow_const
+; CHECK-PWR9:        bl __powd2_P9
+; CHECK-PWR8:        bl __powd2_P8
+; CHECK:             blr
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
+  %next.gep = getelementptr double, double* %y, i64 %index
+  %next.gep19 = getelementptr double, double* %x, i64 %index
+  %0 = bitcast double* %next.gep19 to <2 x double>*
+  %wide.load = load <2 x double>, <2 x double>* %0, align 8
+  %1 = call ninf afn nsz <2 x double> @__powd2_massv(<2 x double> %wide.load, <2 x double> <double 7.600000e-01, double 7.600000e-01>)
+  %2 = bitcast double* %next.gep to <2 x double>*
+  store <2 x double> %1, <2 x double>* %2, align 8
+  %index.next = add i64 %index, 2
+  %3 = icmp eq i64 %index.next, 1024
+  br i1 %3, label %for.end, label %vector.body
+
+for.end:
+  ret void
+}
+
+; Exponent is 0.75 with ninf+afn: no MASSV call is expected (see CHECK-NOT).
+define void @my_vpow_075(double* nocapture %y, double* nocapture readonly %x) {
+; CHECK-LABEL:       @my_vpow_075
+; CHECK-NOT:         bl __powd2_P{{[8,9]}}
+; CHECK:             xvrsqrtesp
+; CHECK:             blr
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
+  %next.gep = getelementptr double, double* %y, i64 %index
+  %next.gep19 = getelementptr double, double* %x, i64 %index
+  %0 = bitcast double* %next.gep19 to <2 x double>*
+  %wide.load = load <2 x double>, <2 x double>* %0, align 8
+  %1 = call ninf afn <2 x double> @__powd2_massv(<2 x double> %wide.load, <2 x double> <double 7.500000e-01, double 7.500000e-01>)
+  %2 = bitcast double* %next.gep to <2 x double>*
+  store <2 x double> %1, <2 x double>* %2, align 8
+  %index.next = add i64 %index, 2
+  %3 = icmp eq i64 %index.next, 1024
+  br i1 %3, label %for.end, label %vector.body
+
+for.end:
+  ret void
+}
+
+; Exponent is 0.25 with ninf+afn+nsz: no MASSV call is expected (see CHECK-NOT).
+define void @my_vpow_025(double* nocapture %y, double* nocapture readonly %x) {
+; CHECK-LABEL:       @my_vpow_025
+; CHECK-NOT:         bl __powd2_P{{[8,9]}}
+; CHECK:             xvrsqrtesp
+; CHECK:             blr
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
+  %next.gep = getelementptr double, double* %y, i64 %index
+  %next.gep19 = getelementptr double, double* %x, i64 %index
+  %0 = bitcast double* %next.gep19 to <2 x double>*
+  %wide.load = load <2 x double>, <2 x double>* %0, align 8
+  %1 = call ninf afn nsz <2 x double> @__powd2_massv(<2 x double> %wide.load, <2 x double> <double 2.500000e-01, double 2.500000e-01>)
+  %2 = bitcast double* %next.gep to <2 x double>*
+  store <2 x double> %1, <2 x double>* %2, align 8
+  %index.next = add i64 %index, 2
+  %3 = icmp eq i64 %index.next, 1024
+  br i1 %3, label %for.end, label %vector.body
+
+for.end:
+  ret void
+}
+
+; Exponent is 0.75 but the call has no fast-math flags: MASSV call is kept.
+define void @my_vpow_075_nofast(double* nocapture %y, double* nocapture readonly %x) {
+; CHECK-LABEL:       @my_vpow_075_nofast
+; CHECK-PWR9:        bl __powd2_P9
+; CHECK-PWR8:        bl __powd2_P8
+; CHECK-NOT:         xvrsqrtesp
+; CHECK:             blr
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
+  %next.gep = getelementptr double, double* %y, i64 %index
+  %next.gep19 = getelementptr double, double* %x, i64 %index
+  %0 = bitcast double* %next.gep19 to <2 x double>*
+  %wide.load = load <2 x double>, <2 x double>* %0, align 8
+  %1 = call <2 x double> @__powd2_massv(<2 x double> %wide.load, <2 x double> <double 7.500000e-01, double 7.500000e-01>)
+  %2 = bitcast double* %next.gep to <2 x double>*
+  store <2 x double> %1, <2 x double>* %2, align 8
+  %index.next = add i64 %index, 2
+  %3 = icmp eq i64 %index.next, 1024
+  br i1 %3, label %for.end, label %vector.body
+
+for.end:
+  ret void
+}
+
+; Exponent is 0.25 but the call has no fast-math flags: MASSV call is kept.
+define void @my_vpow_025_nofast(double* nocapture %y, double* nocapture readonly %x) {
+; CHECK-LABEL:       @my_vpow_025_nofast
+; CHECK-PWR9:        bl __powd2_P9
+; CHECK-PWR8:        bl __powd2_P8
+; CHECK-NOT:         xvrsqrtesp
+; CHECK:             blr
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
+  %next.gep = getelementptr double, double* %y, i64 %index
+  %next.gep19 = getelementptr double, double* %x, i64 %index
+  %0 = bitcast double* %next.gep19 to <2 x double>*
+  %wide.load = load <2 x double>, <2 x double>* %0, align 8
+  %1 = call <2 x double> @__powd2_massv(<2 x double> %wide.load, <2 x double> <double 2.500000e-01, double 2.500000e-01>)
+  %2 = bitcast double* %next.gep to <2 x double>*
+  store <2 x double> %1, <2 x double>* %2, align 8
+  %index.next = add i64 %index, 2
+  %3 = icmp eq i64 %index.next, 1024
+  br i1 %3, label %for.end, label %vector.body
+
+for.end:
+  ret void
+}
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare <2 x double> @__powd2_massv(<2 x double>, <2 x double>) #1
diff --git a/llvm/test/CodeGen/PowerPC/powf_massv_075_025exp.ll b/llvm/test/CodeGen/PowerPC/powf_massv_075_025exp.ll

new file mode 100644 (file)

index 0000000..f251225
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/powf_massv_075_025exp.ll
@@ -0,0 +1,166 @@
+; RUN: llc -vector-library=MASSV < %s -mtriple=powerpc64le-unknown-unknown -mcpu=pwr9 | FileCheck -check-prefixes=CHECK-PWR9 %s
+; RUN: llc -vector-library=MASSV < %s -mtriple=powerpc64le-unknown-unknown -mcpu=pwr8 | FileCheck -check-prefixes=CHECK-PWR8 %s
+
+; Exponent is a variable: the subtarget-specific MASSV call must be kept.
+define void @vspow_var(float* nocapture %z, float* nocapture readonly %y, float* nocapture readonly %x)  {
+; CHECK-LABEL:       @vspow_var
+; CHECK-PWR9:        bl __powf4_P9
+; CHECK-PWR8:        bl __powf4_P8
+; CHECK:             blr
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
+  %next.gep = getelementptr float, float* %z, i64 %index
+  %next.gep31 = getelementptr float, float* %y, i64 %index
+  %next.gep32 = getelementptr float, float* %x, i64 %index
+  %0 = bitcast float* %next.gep32 to <4 x float>*
+  %wide.load = load <4 x float>, <4 x float>* %0, align 4
+  %1 = bitcast float* %next.gep31 to <4 x float>*
+  %wide.load33 = load <4 x float>, <4 x float>* %1, align 4
+  %2 = call ninf afn nsz <4 x float> @__powf4_massv(<4 x float> %wide.load, <4 x float> %wide.load33)
+  %3 = bitcast float* %next.gep to <4 x float>*
+  store <4 x float> %2, <4 x float>* %3, align 4
+  %index.next = add i64 %index, 4
+  %4 = icmp eq i64 %index.next, 1024
+  br i1 %4, label %for.end, label %vector.body
+
+for.end:
+  ret void
+}
+
+; Exponent is a constant splat other than 0.75 and 0.25: MASSV call is kept.
+define void @vspow_const(float* nocapture %y, float* nocapture readonly %x)  {
+; CHECK-LABEL:       @vspow_const
+; CHECK-PWR9:        bl __powf4_P9
+; CHECK-PWR8:        bl __powf4_P8
+; CHECK:             blr
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
+  %next.gep = getelementptr float, float* %y, i64 %index
+  %next.gep19 = getelementptr float, float* %x, i64 %index
+  %0 = bitcast float* %next.gep19 to <4 x float>*
+  %wide.load = load <4 x float>, <4 x float>* %0, align 4
+  %1 = call ninf afn nsz <4 x float> @__powf4_massv(<4 x float> %wide.load, <4 x float> <float 0x3FE851EB80000000, float 0x3FE851EB80000000, float 0x3FE851EB80000000, float 0x3FE851EB80000000>)
+  %2 = bitcast float* %next.gep to <4 x float>*
+  store <4 x float> %1, <4 x float>* %2, align 4
+  %index.next = add i64 %index, 4
+  %3 = icmp eq i64 %index.next, 1024
+  br i1 %3, label %for.end, label %vector.body
+
+for.end:
+  ret void
+}
+
+; Exponent is 0.75 with ninf+afn: no MASSV call is expected (see CHECK-NOT).
+define void @vspow_075(float* nocapture %y, float* nocapture readonly %x)  {
+; CHECK-LABEL:       @vspow_075
+; CHECK-NOT:         bl __powf4_P{{[8,9]}}
+; CHECK:             xvrsqrtesp
+; CHECK:             blr
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
+  %next.gep = getelementptr float, float* %y, i64 %index
+  %next.gep19 = getelementptr float, float* %x, i64 %index
+  %0 = bitcast float* %next.gep19 to <4 x float>*
+  %wide.load = load <4 x float>, <4 x float>* %0, align 4
+  %1 = call ninf afn <4 x float> @__powf4_massv(<4 x float> %wide.load, <4 x float> <float 7.500000e-01, float 7.500000e-01, float 7.500000e-01, float 7.500000e-01>)
+  %2 = bitcast float* %next.gep to <4 x float>*
+  store <4 x float> %1, <4 x float>* %2, align 4
+  %index.next = add i64 %index, 4
+  %3 = icmp eq i64 %index.next, 1024
+  br i1 %3, label %for.end, label %vector.body
+
+for.end:
+  ret void
+}
+
+; Exponent is 0.25 with ninf+afn+nsz: no MASSV call is expected (see CHECK-NOT).
+define void @vspow_025(float* nocapture %y, float* nocapture readonly %x)  {
+; CHECK-LABEL:       @vspow_025
+; CHECK-NOT:         bl __powf4_P{{[8,9]}}
+; CHECK:             xvrsqrtesp
+; CHECK:             blr
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
+  %next.gep = getelementptr float, float* %y, i64 %index
+  %next.gep19 = getelementptr float, float* %x, i64 %index
+  %0 = bitcast float* %next.gep19 to <4 x float>*
+  %wide.load = load <4 x float>, <4 x float>* %0, align 4
+  %1 = call ninf afn nsz <4 x float> @__powf4_massv(<4 x float> %wide.load, <4 x float> <float 2.500000e-01, float 2.500000e-01, float 2.500000e-01, float 2.500000e-01>)
+  %2 = bitcast float* %next.gep to <4 x float>*
+  store <4 x float> %1, <4 x float>* %2, align 4
+  %index.next = add i64 %index, 4
+  %3 = icmp eq i64 %index.next, 1024
+  br i1 %3, label %for.end, label %vector.body
+
+for.end:
+  ret void
+}
+
+; Exponent is 0.75 but the call has no fast-math flags: MASSV call is kept.
+define void @vspow_075_nofast(float* nocapture %y, float* nocapture readonly %x)  {
+; CHECK-LABEL:       @vspow_075_nofast
+; CHECK-PWR9:        bl __powf4_P9
+; CHECK-PWR8:        bl __powf4_P8
+; CHECK-NOT:         xvrsqrtesp
+; CHECK:             blr
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
+  %next.gep = getelementptr float, float* %y, i64 %index
+  %next.gep19 = getelementptr float, float* %x, i64 %index
+  %0 = bitcast float* %next.gep19 to <4 x float>*
+  %wide.load = load <4 x float>, <4 x float>* %0, align 4
+  %1 = call <4 x float> @__powf4_massv(<4 x float> %wide.load, <4 x float> <float 7.500000e-01, float 7.500000e-01, float 7.500000e-01, float 7.500000e-01>)
+  %2 = bitcast float* %next.gep to <4 x float>*
+  store <4 x float> %1, <4 x float>* %2, align 4
+  %index.next = add i64 %index, 4
+  %3 = icmp eq i64 %index.next, 1024
+  br i1 %3, label %for.end, label %vector.body
+
+for.end:
+  ret void
+}
+
+; Exponent is 0.25 but the call has no fast-math flags: MASSV call is kept.
+define void @vspow_025_nofast(float* nocapture %y, float* nocapture readonly %x)  {
+; CHECK-LABEL:       @vspow_025_nofast
+; CHECK-PWR9:        bl __powf4_P9
+; CHECK-PWR8:        bl __powf4_P8
+; CHECK-NOT:         xvrsqrtesp
+; CHECK:             blr
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
+  %next.gep = getelementptr float, float* %y, i64 %index
+  %next.gep19 = getelementptr float, float* %x, i64 %index
+  %0 = bitcast float* %next.gep19 to <4 x float>*
+  %wide.load = load <4 x float>, <4 x float>* %0, align 4
+  %1 = call <4 x float> @__powf4_massv(<4 x float> %wide.load, <4 x float> <float 2.500000e-01, float 2.500000e-01, float 2.500000e-01, float 2.500000e-01>)
+  %2 = bitcast float* %next.gep to <4 x float>*
+  store <4 x float> %1, <4 x float>* %2, align 4
+  %index.next = add i64 %index, 4
+  %3 = icmp eq i64 %index.next, 1024
+  br i1 %3, label %for.end, label %vector.body
+
+for.end:
+  ret void
+}
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare <4 x float> @__powf4_massv(<4 x float>, <4 x float>)
author	Masoud Ataei <msd.ataei@gmail.com>
	Fri, 12 Jun 2020 14:02:16 +0000 (10:02 -0400)
committer	Masoud Ataei <msd.ataei@gmail.com>
	Fri, 12 Jun 2020 14:02:16 +0000 (10:02 -0400)
llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp		patch \| blob \| history
llvm/test/CodeGen/PowerPC/pow_massv_075_025exp.ll	[new file with mode: 0644]	patch \| blob
llvm/test/CodeGen/PowerPC/powf_massv_075_025exp.ll	[new file with mode: 0644]	patch \| blob