[Beignet] [PATCH] Refined the fmax and fmin builtins.

Zou, Nanhai nanhai.zou at intel.com
Mon Mar 24 01:28:22 PDT 2014


Looks good to me.
I got about a 60%-70% performance improvement with this patch in LuxMark.

Thanks
Zou Nanhai

-----Original Message-----
From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of Yang Rong
Sent: Monday, March 24, 2014 4:28 PM
To: beignet at lists.freedesktop.org
Cc: Yang, Rong R
Subject: [Beignet] [PATCH] Refined the fmax and fmin builtins.

Because GEN's select instruction with the .l and .ge conditional modifiers (cmod) already handles the NaN case, use a compare followed by a select in the Gen IR for fmax and fmin. The pair will be optimized into a single sel_cmp instruction, so there is no need to check isnan explicitly.

Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
 backend/src/llvm/llvm_gen_backend.cpp      | 18 ++++++++++++++++++
 backend/src/llvm/llvm_gen_ocl_function.hxx |  2 ++
 backend/src/ocl_stdlib.tmpl.h              | 10 ++++------
 3 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 49fbc7b..c459f25 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2134,6 +2134,8 @@ namespace gbe
       case GEN_OCL_UPSAMPLE_INT:
       case GEN_OCL_UPSAMPLE_LONG:
       case GEN_OCL_MAD:
+      case GEN_OCL_FMAX:
+      case GEN_OCL_FMIN:
       case GEN_OCL_SADD_SAT_CHAR:
       case GEN_OCL_SADD_SAT_SHORT:
       case GEN_OCL_SADD_SAT_INT:
@@ -2622,6 +2624,22 @@ namespace gbe
             ctx.MAD(getType(ctx, I.getType()), dst, src0, src1, src2);
             break;
           }
+          case GEN_OCL_FMAX:
+          case GEN_OCL_FMIN:{
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            const ir::Register cmp = ctx.reg(ir::FAMILY_BOOL);
+            //Because cmp's sources are the same as sel's sources, the cmp and sel
+            //instructions will be merged into one sel_cmp instruction in the gen selection.
+            //Two instructions are emitted here for simplicity.
+            if(it->second == GEN_OCL_FMAX)
+              ctx.GE(getType(ctx, I.getType()), cmp, src0, src1);
+            else
+              ctx.LT(getType(ctx, I.getType()), cmp, src0, src1);
+            ctx.SEL(getType(ctx, I.getType()), dst, cmp, src0, src1);
+            break;
+          }
           case GEN_OCL_HADD: {
             GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
             GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 00d69f0..5bf794a 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -33,6 +33,8 @@ DECL_LLVM_GEN_FUNCTION(RNDE, __gen_ocl_rnde)
 DECL_LLVM_GEN_FUNCTION(RNDU, __gen_ocl_rndu)
 DECL_LLVM_GEN_FUNCTION(RNDD, __gen_ocl_rndd)
 DECL_LLVM_GEN_FUNCTION(MAD, __gen_ocl_mad)
+DECL_LLVM_GEN_FUNCTION(FMAX, __gen_ocl_fmax) 
+DECL_LLVM_GEN_FUNCTION(FMIN, __gen_ocl_fmin)
 
 // Barrier function
 DECL_LLVM_GEN_FUNCTION(LBARRIER, __gen_ocl_barrier_local)
diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
index e3ac632..e823b5f 100755
--- a/backend/src/ocl_stdlib.tmpl.h
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -3169,6 +3169,8 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_exp10(float x){
 #define remainder __gen_ocl_internal_remainder
 #define ldexp __gen_ocl_internal_ldexp
 PURE CONST float __gen_ocl_mad(float a, float b, float c);
+PURE CONST float __gen_ocl_fmax(float a, float b);
+PURE CONST float __gen_ocl_fmin(float a, float b);
 INLINE_OVERLOADABLE float mad(float a, float b, float c) {
   return __gen_ocl_mad(a, b, c);
 }
@@ -3224,14 +3226,10 @@ DECL_MIN_MAX_CLAMP(long)
 DECL_MIN_MAX_CLAMP(ulong)
 #undef DECL_MIN_MAX_CLAMP
 INLINE_OVERLOADABLE float max(float a, float b) {
-  if(isnan(b))
-    return a;
-  return a > b ? a : b;
+  return __gen_ocl_fmax(a, b);
 }
 INLINE_OVERLOADABLE float min(float a, float b) {
-  if(isnan(b))
-    return a;
-  return a < b ? a : b;
+  return __gen_ocl_fmin(a, b);
 }
 INLINE_OVERLOADABLE float clamp(float v, float l, float u) {
   return max(min(v, u), l);
--
1.8.3.2

_______________________________________________
Beignet mailing list
Beignet at lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


More information about the Beignet mailing list