[Beignet] [PATCH 1/2] GBE: Optimize read_image performance for CL_ADDRESS_CLAMP..

Thu Apr 10 20:42:48 PDT 2014

Two comments.

-----Original Message-----
From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of Zhigang Gong
Sent: Thursday, April 10, 2014 12:41 PM
To: beignet at lists.freedesktop.org
Cc: Gong, Zhigang
Subject: [Beignet] [PATCH 1/2] GBE: Optimize read_image performance for CL_ADDRESS_CLAMP..

The previous workaround (due to a hardware restriction) was to use CL_ADDRESS_CLAMP_TO_EDGE to implement CL_ADDRESS_CLAMP, which is not very efficient, especially because of the boundary-checking overhead.
The root cause is that we need to check each pixel's coordinate.

We now use the LD message to implement CL_ADDRESS_CLAMP instead. For integer coordinates, we don't need to do any boundary checking. For float coordinates, we only need to check whether a coordinate is less than zero, which is much simpler than before.

This patch can bring a performance gain of about 20% to 30% for luxmark's medium and simple scenes.

Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
 backend/src/backend/gen_context.cpp        |  2 +-
 backend/src/backend/gen_defs.hpp           |  4 +-
 backend/src/backend/gen_encoder.cpp        |  7 +--
 backend/src/backend/gen_encoder.hpp        |  3 +-
 backend/src/backend/gen_insn_selection.cpp | 32 +++++++++----  backend/src/backend/gen_insn_selection.hpp |  1 +
 backend/src/llvm/llvm_gen_backend.cpp      | 29 +++++++++++-
 backend/src/llvm/llvm_gen_ocl_function.hxx |  8 +++-
 backend/src/llvm/llvm_scalarize.cpp        |  9 +++-
 backend/src/ocl_stdlib.tmpl.h              | 72 +++++++++++++++++++++---------
 src/intel/intel_driver.c                   |  2 +-
 src/intel/intel_gpgpu.c                    | 15 +------
 12 files changed, 129 insertions(+), 55 deletions(-)

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 50f10c5..ea673b6 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -1848,7 +1848,7 @@ namespace gbe
     const unsigned char sampler = insn.extra.sampler;
     const unsigned int msgLen = insn.extra.rdmsglen;
     uint32_t simdWidth = p->curr.execWidth;
-    p->SAMPLE(dst, msgPayload, msgLen, false, bti, sampler, simdWidth, -1, 0);
+    p->SAMPLE(dst, msgPayload, msgLen, false, bti, sampler, simdWidth, 
+ -1, 0, insn.extra.isLD);
   }
 
   void GenContext::scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode) { diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index e731174..f24d924 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -370,8 +370,8 @@ enum GenMessageTarget {
 #define GEN_SAMPLER_MESSAGE_SIMD4X2_RESINFO           2
 #define GEN_SAMPLER_MESSAGE_SIMD16_RESINFO            2
 #define GEN_SAMPLER_MESSAGE_SIMD4X2_LD                3
-#define GEN_SAMPLER_MESSAGE_SIMD8_LD                  3
-#define GEN_SAMPLER_MESSAGE_SIMD16_LD                 3
+#define GEN_SAMPLER_MESSAGE_SIMD8_LD                  7
+#define GEN_SAMPLER_MESSAGE_SIMD16_LD                 7
 
 #define GEN5_SAMPLER_MESSAGE_SAMPLE              0
 #define GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS         1
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 9df031e..ce9be09 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -1264,11 +1264,12 @@ namespace gbe
                           unsigned char sampler,
                           uint32_t simdWidth,
                           uint32_t writemask,
-                          uint32_t return_format)
+                          uint32_t return_format,
+                          bool isLD)
   {
      if (writemask == 0) return;
-     uint32_t msg_type =  (simdWidth == 16) ?
-                            GEN_SAMPLER_MESSAGE_SIMD16_SAMPLE : GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE;
+     uint32_t msg_type = isLD ? GEN_SAMPLER_MESSAGE_SIMD8_LD :
+                                GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE;
      uint32_t response_length = (4 * (simdWidth / 8));
      uint32_t msg_length = (msg_len * (simdWidth / 8));
      if (header_present)
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index 50662fb..321c8c1 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -185,7 +185,8 @@ namespace gbe
                 unsigned char sampler,
                 unsigned int simdWidth,
                 uint32_t writemask,
-                uint32_t return_format);
+                uint32_t return_format,
+                bool isLD);
 
     /*! TypedWrite instruction for texture */
     void TYPED_WRITE(GenRegister header, diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 961f3af..fea0329 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -559,7 +559,7 @@ namespace gbe
     /*! Encode ternary instructions */
     void ALU3(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg src2);
     /*! Encode sample instructions */
-    void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler, bool is3D);
+    void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister 
+ *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler, bool 
+ isLD);
     /*! Encode typed write instructions */
     void TYPED_WRITE(GenRegister *msgs, uint32_t msgNum, uint32_t bti, bool is3D);
     /*! Get image information */
@@ -1500,7 +1500,7 @@ namespace gbe
 
   void Selection::Opaque::SAMPLE(GenRegister *dst, uint32_t dstNum,
                                  GenRegister *msgPayloads, uint32_t msgNum,
-                                 uint32_t bti, uint32_t sampler, bool is3D) {
+                                 uint32_t bti, uint32_t sampler, bool 
+ isLD) {
     SelectionInstruction *insn = this->appendInsn(SEL_OP_SAMPLE, dstNum, msgNum);
     SelectionVector *dstVector = this->appendVector();
     SelectionVector *msgVector = this->appendVector(); @@ -1524,6 +1524,7 @@ namespace gbe
     insn->extra.rdbti = bti;
     insn->extra.sampler = sampler;
     insn->extra.rdmsglen = msgNum;
+    insn->extra.isLD = isLD;
   }
 
   ///////////////////////////////////////////////////////////////////////////
@@ -3161,21 +3162,36 @@ namespace gbe
       GenRegister dst[insn.getDstNum()];
       uint32_t srcNum = insn.getSrcNum();
       uint32_t valueID = 0;
+      uint32_t msgLen = 0;
 
       for (valueID = 0; valueID < insn.getDstNum(); ++valueID)
         dst[valueID] = sel.selReg(insn.getDst(valueID), insn.getDstType());
 
       if (!insn.is3D())
         srcNum--;
-      /* U, V, [W] */
-      for (valueID = 0; valueID < srcNum; ++valueID)
-        msgPayloads[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
 
+      if (insn.getSamplerOffset() != 0) {
+        // U, lod, V, [W]
+        GBE_ASSERT(insn.getSrcType() != TYPE_FLOAT);
+        msgPayloads[0] = sel.selReg(insn.getSrc(0), insn.getSrcType());
+        msgPayloads[1] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+        msgPayloads[2] = sel.selReg(insn.getSrc(1), insn.getSrcType());
+        if (srcNum > 2)
+          msgPayloads[3] = sel.selReg(insn.getSrc(2), insn.getSrcType());
+        // Clear the lod to zero.
+        sel.MOV(msgPayloads[1], GenRegister::immud(0));
+        msgLen = srcNum + 1;
+      } else {
+        // U, V, [W]
+        GBE_ASSERT(insn.getSrcType() == TYPE_FLOAT);
+        for (valueID = 0; valueID < srcNum; ++valueID)
+          msgPayloads[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
+        msgLen = srcNum;
+      }
       uint32_t bti = insn.getImageIndex();
-      /* We have the clamp border workaround. */
-      uint32_t sampler = insn.getSamplerIndex() + insn.getSamplerOffset() * 8;
+      uint32_t sampler = insn.getSamplerIndex();
 
-      sel.SAMPLE(dst, insn.getDstNum(), msgPayloads, srcNum, bti, sampler, insn.is3D());
+      sel.SAMPLE(dst, insn.getDstNum(), msgPayloads, msgLen, bti, 
+ sampler, insn.getSamplerOffset());
       return true;
     }
     DECL_CTOR(SampleInstruction, 1, 1); diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 85974f0..ad8c4ec 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -123,6 +123,7 @@ namespace gbe
         uint16_t rdbti:8;
         uint16_t sampler:5;
         uint16_t rdmsglen:3;
+        bool     isLD;  // is this a ld message?
       };
       uint32_t barrierType;
       bool longjmp;
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 5a2ba16..b46e991 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2210,6 +2210,12 @@ namespace gbe
       case GEN_OCL_READ_IMAGE_I_3D:
       case GEN_OCL_READ_IMAGE_UI_3D:
       case GEN_OCL_READ_IMAGE_F_3D:
+      case GEN_OCL_READ_IMAGE_I_I:
+      case GEN_OCL_READ_IMAGE_UI_I:
+      case GEN_OCL_READ_IMAGE_F_I:
+      case GEN_OCL_READ_IMAGE_I_3D_I:
+      case GEN_OCL_READ_IMAGE_UI_3D_I:
+      case GEN_OCL_READ_IMAGE_F_3D_I:
       {
         // dst is a 4 elements vector. We allocate all 4 registers here.
         uint32_t elemNum;
@@ -2480,6 +2486,12 @@ namespace gbe
           case GEN_OCL_READ_IMAGE_I_3D:
           case GEN_OCL_READ_IMAGE_UI_3D:
           case GEN_OCL_READ_IMAGE_F_3D:
+          case GEN_OCL_READ_IMAGE_I_I:
+          case GEN_OCL_READ_IMAGE_UI_I:
+          case GEN_OCL_READ_IMAGE_F_I:
+          case GEN_OCL_READ_IMAGE_I_3D_I:
+          case GEN_OCL_READ_IMAGE_UI_3D_I:
+          case GEN_OCL_READ_IMAGE_F_3D_I:
           {
             GBE_ASSERT(AI != AE); const ir::Register surfaceReg = this->getRegister(*AI); ++AI;
             const uint8_t surfaceID = ctx.getFunction().getImageSet()->getIdx(surfaceReg);
@@ -2491,7 +2503,12 @@ namespace gbe
             GBE_ASSERT(AI != AE); const ir::Register vcoord = this->getRegister(*AI); ++AI;
             ir::Register wcoord;
             bool is3D = false;
-            if (it->second >= GEN_OCL_READ_IMAGE_I_3D) {
+            if (it->second == GEN_OCL_READ_IMAGE_I_3D    ||
+                it->second == GEN_OCL_READ_IMAGE_UI_3D   ||
+                it->second == GEN_OCL_READ_IMAGE_F_3D    ||
+                it->second == GEN_OCL_READ_IMAGE_I_3D_I  ||
+                it->second == GEN_OCL_READ_IMAGE_UI_3D_I ||
+                it->second == GEN_OCL_READ_IMAGE_F_3D_I) {
               GBE_ASSERT(AI != AE); wcoord = this->getRegister(*AI); ++AI;
               is3D = true;
             } else
@@ -2524,18 +2541,26 @@ namespace gbe
               case GEN_OCL_READ_IMAGE_UI:
               case GEN_OCL_READ_IMAGE_I_3D:
               case GEN_OCL_READ_IMAGE_UI_3D:
+              case GEN_OCL_READ_IMAGE_I_I:
+              case GEN_OCL_READ_IMAGE_UI_I:
+              case GEN_OCL_READ_IMAGE_I_3D_I:
+              case GEN_OCL_READ_IMAGE_UI_3D_I:
                 dstType = ir::TYPE_U32;
                 break;
               case GEN_OCL_READ_IMAGE_F:
               case GEN_OCL_READ_IMAGE_F_3D:
+              case GEN_OCL_READ_IMAGE_F_I:
+              case GEN_OCL_READ_IMAGE_F_3D_I:
                 dstType = ir::TYPE_FLOAT;
                 break;
               default:
                 GBE_ASSERT(0); // never been here.
             }
 
+            bool isFloatCoord = it->second <= GEN_OCL_READ_IMAGE_F_3D;
+
             ctx.SAMPLE(surfaceID, dstTuple, srcTuple, dstType == ir::TYPE_FLOAT,
-                       true, sampler, samplerOffset, is3D);
+                       isFloatCoord, sampler, samplerOffset, is3D);
             break;
           }
           case GEN_OCL_WRITE_IMAGE_I:
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 5bf794a..7058a60 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -49,10 +49,16 @@ DECL_LLVM_GEN_FUNCTION(FORCE_SIMD16, __gen_ocl_force_simd16)  DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I, _Z21__gen_ocl_read_imageijtffj)  DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI, _Z22__gen_ocl_read_imageuijtffj)  DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F, _Z21__gen_ocl_read_imagefjtffj)
-
 DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_3D, _Z21__gen_ocl_read_imageijtfffj)  DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_3D, _Z22__gen_ocl_read_imageuijtfffj)  DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_3D, _Z21__gen_ocl_read_imagefjtfffj)
+// work around read image with the LD message. The coords are integer type.
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_I, _Z21__gen_ocl_read_imageijtiij) 
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_I, 
+_Z22__gen_ocl_read_imageuijtiij) DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_I, 
+_Z21__gen_ocl_read_imagefjtiij) 
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_3D_I, 
+_Z21__gen_ocl_read_imageijtiiij) 
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_3D_I, 
+_Z22__gen_ocl_read_imageuijtiiij) 
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_3D_I, 
+_Z21__gen_ocl_read_imagefjtiiij)
 
 // To write_image functions.
 DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_I, _Z22__gen_ocl_write_imageijiiDv4_i)
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index 7095473..911be30 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -1,4 +1,4 @@
-/*
+;/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or @@ -649,6 +649,12 @@ namespace gbe {
           case GEN_OCL_READ_IMAGE_I_3D:
           case GEN_OCL_READ_IMAGE_UI_3D:
           case GEN_OCL_READ_IMAGE_F_3D:
+          case GEN_OCL_READ_IMAGE_I_I:
+          case GEN_OCL_READ_IMAGE_UI_I:
+          case GEN_OCL_READ_IMAGE_F_I:
+          case GEN_OCL_READ_IMAGE_I_3D_I:
+          case GEN_OCL_READ_IMAGE_UI_3D_I:
+          case GEN_OCL_READ_IMAGE_F_3D_I:
           case GEN_OCL_GET_IMAGE_WIDTH:
           case GEN_OCL_GET_IMAGE_HEIGHT:
           {
@@ -797,7 +803,6 @@ namespace gbe {
     for (SmallVectorImpl<PHINode*>::iterator phiI = incompletePhis.begin(), phiE = incompletePhis.end();
        phiI != phiE; ++phiI) {
       assert(canGetComponentArgs(*phiI) && "Phi's operands never scalarized");
-
       // Fill in each component of this phi
       VectorValues& vVals = vectorVals[*phiI];
       for (int c = 0; c < GetComponentCount(*phiI); ++c) { diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h index 50107d8..b7dc607 100755
--- a/backend/src/ocl_stdlib.tmpl.h
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -4538,12 +4538,18 @@ int __gen_ocl_force_simd16(void);  /////////////////////////////////////////////////////////////////////////////
 
 OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t 
+sampler, int u, int v, uint sampler_offset);
 OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t 
+sampler, int u, int v, uint sampler_offset);
 OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t 
+sampler, int u, int v, uint sampler_offset);
 
 OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t 
+sampler, int u, int v, int w, uint sampler_offset);
 OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t 
+sampler, int u, int v, int w, uint sampler_offset);
 OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t 
+sampler, int u, int v, int w, uint sampler_offset);
 
 OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int4 color);  OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, int v, uint4 color); @@ -4567,8 +4573,27 @@ int __gen_ocl_get_image_depth(uint surface_id);  #define GEN_FIX_1 0  #endif
 
-#define DECL_READ_IMAGE(float_coord_rounding_fix, int_clamping_fix,          \
-                        image_type, type, suffix, coord_type)                \
+#define DECL_READ_IMAGE0(float_coord_rounding_fix, int_clamping_fix,          \
+                        image_type, type, suffix, coord_type, n)             \
+  INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image,          \
+                                               const sampler_t sampler,      \
+                                               coord_type coord)             \
+  {                                                                          \
+    GET_IMAGE(cl_image, surface_id);                                         \
+    if (float_coord_rounding_fix | int_clamping_fix) {                       \
+      if (((sampler & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP)              \
+          && ((sampler & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST)) {        \
+        if (int_clamping_fix)                                                \
+            return   __gen_ocl_read_image ##suffix(                          \
+                        EXPEND_READ_COORD(surface_id, sampler, coord), 1);\
+      }                                                                      \
+    }                                                                        \
+    return  __gen_ocl_read_image ##suffix(                                   \
+                        EXPEND_READ_COORD(surface_id, sampler, 
+(float)coord), 0);\
+  }
+
>>>>>>>>>>>>> float_coord_rounding_fix is unused in DECL_READ_IMAGE0. In fact, the only difference between the two return statements is the last parameter, so why not use a variable for it and avoid the if blocks?



+#define DECL_READ_IMAGE1(float_coord_rounding_fix, int_clamping_fix,          \
+                        image_type, type, suffix, coord_type, n)             \
   INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image,          \
                                                const sampler_t sampler,      \
                                                coord_type coord)             \
@@ -4576,25 +4601,20 @@ int __gen_ocl_get_image_depth(uint surface_id);
     GET_IMAGE(cl_image, surface_id);                                         \
     coord_type tmpCoord = coord;                                             \
     if (float_coord_rounding_fix | int_clamping_fix) {                       \
-      if (((sampler & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP)         \
-          && ((sampler & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST)) {   \
+      if (((sampler & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP)              \
+          && ((sampler & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST)) {        \
         if (float_coord_rounding_fix                                         \
-            && ((sampler & CLK_NORMALIZED_COORDS_TRUE) == 0)) {         \
+            && ((sampler & CLK_NORMALIZED_COORDS_TRUE) == 0)) {              \
           FIXUP_FLOAT_COORD(tmpCoord);                                       \
         }                                                                    \
         if (int_clamping_fix) {                                              \
-           if (OUT_OF_BOX(tmpCoord, surface_id,                              \
-                          (sampler & CLK_NORMALIZED_COORDS_TRUE))) {    \
-            unsigned int border_alpha;                                       \
-            int order = __gen_ocl_get_image_channel_order(surface_id);       \
-            if (!CLK_HAS_ALPHA(order)) {                                     \
-              border_alpha = 1;                                              \
+            coord_type intCoord;                                             \
+            if (sampler & CLK_NORMALIZED_COORDS_TRUE) {                      \
+              DENORMALIZE_COORD(surface_id, intCoord, tmpCoord);             \
             } else                                                           \
-              border_alpha = 0;                                              \
-              return (type)(0, 0, 0, border_alpha);                          \
-          } else                                                             \
+              intCoord = tmpCoord;                                           \
             return   __gen_ocl_read_image ##suffix(                          \
-                        EXPEND_READ_COORD(surface_id, sampler, tmpCoord), 1);\
+                        EXPEND_READ_COORD1(surface_id, sampler, 
+ intCoord), 1);\
        }                                                                     \
       }                                                                      \
     }                                                                        \
>>>>>>Now that only float coordinates use DECL_READ_IMAGE1, why is int_clamping_fix still needed here?





@@ -4603,7 +4623,7 @@ int __gen_ocl_get_image_depth(uint surface_id);
   }
 
 
-#define DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, coord_type)      \
+#define DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, coord_type, n)   \
   INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image,          \
                                                coord_type coord)             \
   {                                                                          \
@@ -4612,7 +4632,7 @@ int __gen_ocl_get_image_depth(uint surface_id);
            EXPEND_READ_COORD(surface_id,                                     \
                              CLK_NORMALIZED_COORDS_FALSE                     \
                              | CLK_ADDRESS_NONE                              \
-                             | CLK_FILTER_NEAREST, coord), 0);               \
+                             | CLK_FILTER_NEAREST, (float)coord), 0);               \
   }
 
 #define DECL_WRITE_IMAGE(image_type, type, suffix, coord_type) \ @@ -4623,6 +4643,10 @@ int __gen_ocl_get_image_depth(uint surface_id);
   }
 
 #define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1
+#define EXPEND_READ_COORD1(id, sampler, coord) id, sampler, (int)(coord.s0 < 0 ? -1 : coord.s0), \
+                                               (int)(coord.s1 < 0 ? -1 
+: coord.s1) #define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord.x = srcCoord.x * __gen_ocl_get_image_width(id); \
+                                                  dstCoord.y = 
+srcCoord.y * __gen_ocl_get_image_height(id);
 #define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, color
 
 #define OUT_OF_BOX(coord, surface, normalized)                   \
@@ -4641,9 +4665,9 @@ int __gen_ocl_get_image_depth(uint surface_id);
   }
 
 #define DECL_IMAGE(int_clamping_fix, image_type, type, suffix, n)                   \
-  DECL_READ_IMAGE(0, int_clamping_fix, image_type, type, suffix, int ##n)           \
-  DECL_READ_IMAGE(GEN_FIX_1, int_clamping_fix, image_type, type, suffix, float ##n) \
-  DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, int ##n)                      \
+  DECL_READ_IMAGE0(0, int_clamping_fix, image_type, type, suffix, int ##n, n)        \
+  DECL_READ_IMAGE1(GEN_FIX_1, int_clamping_fix, image_type, type, suffix, float ##n, n) \
+  DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, int ##n, n)                   \
   DECL_WRITE_IMAGE(image_type, type, suffix, int ## n)                              \
   DECL_WRITE_IMAGE(image_type, type, suffix, float ## n)
 
@@ -4652,11 +4676,18 @@ DECL_IMAGE(GEN_FIX_1, image2d_t, uint4, ui, 2)  DECL_IMAGE(0, image2d_t, float4, f, 2)
 
 #undef EXPEND_READ_COORD
+#undef EXPEND_READ_COORD1
+#undef DENORMALIZE_COORD
 #undef EXPEND_WRITE_COORD
 #undef OUT_OF_BOX
 #undef FIXUP_FLOAT_COORD
 
 #define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1, coord.s2
+#define EXPEND_READ_COORD1(id, sampler, coord) id, sampler, (int) (coord.s0 < 0 ? -1 : coord.s0), \
+                                               (int)(coord.s1 < 0 ? -1 
+: coord.s1), (int)(coord.s2 < 0 ? -1 : coord.s2) #define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord.x = srcCoord.x * __gen_ocl_get_image_width(id); \
+                                                  dstCoord.y = srcCoord.y * __gen_ocl_get_image_height(id); \
+                                                  dstCoord.z = 
+srcCoord.z * __gen_ocl_get_image_depth(id);
 #define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, coord.s2, color
 #define OUT_OF_BOX(coord, surface, normalized)                  \
   (coord.s0 < 0 || coord.s1 < 0 || coord.s2 < 0 ||              \
@@ -4685,6 +4716,7 @@ DECL_IMAGE(GEN_FIX_1, image3d_t, int4, i, 3)  DECL_IMAGE(GEN_FIX_1, image3d_t, uint4, ui, 3)  DECL_IMAGE(0, image3d_t, float4, f, 3)  #undef EXPEND_READ_COORD
+#undef DENORMALIZE_COORD
 #undef EXPEND_WRITE_COORD
 #undef OUT_OF_BOX
 #undef FIXUP_FLOAT_COORD
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c index 2a2335b..cce033f 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -135,7 +135,7 @@ intel_driver_memman_init(intel_driver_t *driver)  {
   driver->bufmgr = drm_intel_bufmgr_gem_init(driver->fd, BATCH_SIZE);
   assert(driver->bufmgr);
-  //drm_intel_bufmgr_gem_set_aub_dump(driver->bufmgr, 1); 
+  //drm_intel_bufmgr_gem_set_aub_dump(driver->bufmgr, 1);
   drm_intel_bufmgr_gem_enable_reuse(driver->bufmgr);
 }
 
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index 4819e9e..fbeef11 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -861,22 +861,9 @@ static void
 intel_gpgpu_bind_sampler(intel_gpgpu_t *gpgpu, uint32_t *samplers, size_t sampler_sz)  {
   int index;
-#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
-  //assert(sampler_sz <= GEN_MAX_SAMPLERS/2); -#else
   assert(sampler_sz <= GEN_MAX_SAMPLERS); -#endif
-  for(index = 0; index < sampler_sz; index++) {
+  for(index = 0; index < sampler_sz; index++)
     intel_gpgpu_insert_sampler(gpgpu, index, samplers[index]); -#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
-    /* Duplicate the sampler to 8 + index and fixup the address mode
-     * to repeat.*/
-    if ((samplers[index] & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP) {
-      intel_gpgpu_insert_sampler(gpgpu, index + 8,
-                                 (samplers[index] & ~__CLK_ADDRESS_MASK) | CLK_ADDRESS_CLAMP_TO_EDGE);
-    }
-#endif
-  }
 }
 
 static void
--
1.8.3.2

_______________________________________________
Beignet mailing list
Beignet at lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet