From: Yang Rong <rong.r.yang@intel.com>
Date: Mon, 24 Mar 2014 08:27:31 +0000 (+0800)
Subject: Refined the fmax and fmin builtins.
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=7d8897fdb0d4d6efd279cd08a47f239695a92814;p=contrib%2Fbeignet.git

Refined the fmax and fmin builtins.

Because GEN's select instruction with cmod .l and .ge will handle NaN case, so
use the compare and select instruction in gen ir for fmax and fmin, and will be
optimized to one sel_cmp, need not check isnan.

Signed-off-by: Yang Rong <rong.r.yang@intel.com>
Reviewed-by: "Zou, Nanhai" <nanhai.zou@intel.com>
Reviewed-by: Zhigang Gong <zhigang.gong@linux.intel.com>
---

diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 227ef09..1090f97 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2135,6 +2135,8 @@ namespace gbe
       case GEN_OCL_UPSAMPLE_INT:
       case GEN_OCL_UPSAMPLE_LONG:
       case GEN_OCL_MAD:
+      case GEN_OCL_FMAX:
+      case GEN_OCL_FMIN:
       case GEN_OCL_SADD_SAT_CHAR:
       case GEN_OCL_SADD_SAT_SHORT:
       case GEN_OCL_SADD_SAT_INT:
@@ -2623,6 +2625,22 @@ namespace gbe
             ctx.MAD(getType(ctx, I.getType()), dst, src0, src1, src2);
             break;
           }
+          case GEN_OCL_FMAX:
+          case GEN_OCL_FMIN:{
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            const ir::Register cmp = ctx.reg(ir::FAMILY_BOOL);
+            //Becasue cmp's sources are same as sel's source, so cmp instruction and sel
+            //instruction will be merged to one sel_cmp instruction in the gen selection
+            //Add two intruction here for simple.
+            if(it->second == GEN_OCL_FMAX)
+              ctx.GE(getType(ctx, I.getType()), cmp, src0, src1);
+            else
+              ctx.LT(getType(ctx, I.getType()), cmp, src0, src1);
+            ctx.SEL(getType(ctx, I.getType()), dst, cmp, src0, src1);
+            break;
+          }
           case GEN_OCL_HADD: {
             GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
             GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 00d69f0..5bf794a 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -33,6 +33,8 @@ DECL_LLVM_GEN_FUNCTION(RNDE, __gen_ocl_rnde)
 DECL_LLVM_GEN_FUNCTION(RNDU, __gen_ocl_rndu)
 DECL_LLVM_GEN_FUNCTION(RNDD, __gen_ocl_rndd)
 DECL_LLVM_GEN_FUNCTION(MAD, __gen_ocl_mad)
+DECL_LLVM_GEN_FUNCTION(FMAX, __gen_ocl_fmax)
+DECL_LLVM_GEN_FUNCTION(FMIN, __gen_ocl_fmin)
 
 // Barrier function
 DECL_LLVM_GEN_FUNCTION(LBARRIER, __gen_ocl_barrier_local)
diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
index 25f2ff7..50107d8 100755
--- a/backend/src/ocl_stdlib.tmpl.h
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -3169,6 +3169,8 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_exp10(float x){
 #define remainder __gen_ocl_internal_remainder
 #define ldexp __gen_ocl_internal_ldexp
 PURE CONST float __gen_ocl_mad(float a, float b, float c);
+PURE CONST float __gen_ocl_fmax(float a, float b);
+PURE CONST float __gen_ocl_fmin(float a, float b);
 INLINE_OVERLOADABLE float mad(float a, float b, float c) {
   return __gen_ocl_mad(a, b, c);
 }
@@ -3224,14 +3226,10 @@ DECL_MIN_MAX_CLAMP(long)
 DECL_MIN_MAX_CLAMP(ulong)
 #undef DECL_MIN_MAX_CLAMP
 INLINE_OVERLOADABLE float max(float a, float b) {
-  if(isnan(b))
-    return a;
-  return a > b ? a : b;
+  return __gen_ocl_fmax(a, b);
 }
 INLINE_OVERLOADABLE float min(float a, float b) {
-  if(isnan(b))
-    return a;
-  return a < b ? a : b;
+  return __gen_ocl_fmin(a, b);
 }
 INLINE_OVERLOADABLE float clamp(float v, float l, float u) {
   return max(min(v, u), l);