[Beignet] [PATCH 1/2] GBE: Optimize read_image performance for CL_ADDRESS_CLAMP..

Thu Apr 10 21:58:35 PDT 2014

On Fri, Apr 11, 2014 at 03:42:48AM +0000, Yang, Rong R wrote:
> Two comments.
> 
> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of Zhigang Gong
> Sent: Thursday, April 10, 2014 12:41 PM
> To: beignet at lists.freedesktop.org
> Cc: Gong, Zhigang
> Subject: [Beignet] [PATCH 1/2] GBE: Optimize read_image performance for CL_ADDRESS_CLAMP..
> 
> The previous workaround (due to a hardware restriction) was to use CL_ADDRESS_CLAMP_TO_EDGE to implement CL_ADDRESS_CLAMP, which is not very efficient, especially because of the boundary-checking overhead.
> The root cause is that we need to check each pixel's coordinate.
> 
> Now we change to use the LD message to implement CL_ADDRESS_CLAMP. For integer coordinates, we don't need to do the boundary checking. And for the float coordinates, we only need to check whether it's less than zero which is much simpler than before.
> 
> This patch could bring about 20% to 30% performance gain for luxmark's medium and simple scene.
> 
> Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
> ---
>  backend/src/backend/gen_context.cpp        |  2 +-
>  backend/src/backend/gen_defs.hpp           |  4 +-
>  backend/src/backend/gen_encoder.cpp        |  7 +--
>  backend/src/backend/gen_encoder.hpp        |  3 +-
>  backend/src/backend/gen_insn_selection.cpp | 32 +++++++++----  backend/src/backend/gen_insn_selection.hpp |  1 +
>  backend/src/llvm/llvm_gen_backend.cpp      | 29 +++++++++++-
>  backend/src/llvm/llvm_gen_ocl_function.hxx |  8 +++-
>  backend/src/llvm/llvm_scalarize.cpp        |  9 +++-
>  backend/src/ocl_stdlib.tmpl.h              | 72 +++++++++++++++++++++---------
>  src/intel/intel_driver.c                   |  2 +-
>  src/intel/intel_gpgpu.c                    | 15 +------
>  12 files changed, 129 insertions(+), 55 deletions(-)
> 
> diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
> index 50f10c5..ea673b6 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -1848,7 +1848,7 @@ namespace gbe
>      const unsigned char sampler = insn.extra.sampler;
>      const unsigned int msgLen = insn.extra.rdmsglen;
>      uint32_t simdWidth = p->curr.execWidth;
> -    p->SAMPLE(dst, msgPayload, msgLen, false, bti, sampler, simdWidth, -1, 0);
> +    p->SAMPLE(dst, msgPayload, msgLen, false, bti, sampler, simdWidth, 
> + -1, 0, insn.extra.isLD);
>    }
>  
>    void GenContext::scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode) { diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
> index e731174..f24d924 100644
> --- a/backend/src/backend/gen_defs.hpp
> +++ b/backend/src/backend/gen_defs.hpp
> @@ -370,8 +370,8 @@ enum GenMessageTarget {
>  #define GEN_SAMPLER_MESSAGE_SIMD4X2_RESINFO           2
>  #define GEN_SAMPLER_MESSAGE_SIMD16_RESINFO            2
>  #define GEN_SAMPLER_MESSAGE_SIMD4X2_LD                3
> -#define GEN_SAMPLER_MESSAGE_SIMD8_LD                  3
> -#define GEN_SAMPLER_MESSAGE_SIMD16_LD                 3
> +#define GEN_SAMPLER_MESSAGE_SIMD8_LD                  7
> +#define GEN_SAMPLER_MESSAGE_SIMD16_LD                 7
>  
>  #define GEN5_SAMPLER_MESSAGE_SAMPLE              0
>  #define GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS         1
> diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
> index 9df031e..ce9be09 100644
> --- a/backend/src/backend/gen_encoder.cpp
> +++ b/backend/src/backend/gen_encoder.cpp
> @@ -1264,11 +1264,12 @@ namespace gbe
>                            unsigned char sampler,
>                            uint32_t simdWidth,
>                            uint32_t writemask,
> -                          uint32_t return_format)
> +                          uint32_t return_format,
> +                          bool isLD)
>    {
>       if (writemask == 0) return;
> -     uint32_t msg_type =  (simdWidth == 16) ?
> -                            GEN_SAMPLER_MESSAGE_SIMD16_SAMPLE : GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE;
> +     uint32_t msg_type = isLD ? GEN_SAMPLER_MESSAGE_SIMD8_LD :
> +                                GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE;
>       uint32_t response_length = (4 * (simdWidth / 8));
>       uint32_t msg_length = (msg_len * (simdWidth / 8));
>       if (header_present)
> diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
> index 50662fb..321c8c1 100644
> --- a/backend/src/backend/gen_encoder.hpp
> +++ b/backend/src/backend/gen_encoder.hpp
> @@ -185,7 +185,8 @@ namespace gbe
>                  unsigned char sampler,
>                  unsigned int simdWidth,
>                  uint32_t writemask,
> -                uint32_t return_format);
> +                uint32_t return_format,
> +                bool isLD);
>  
>      /*! TypedWrite instruction for texture */
>      void TYPED_WRITE(GenRegister header, diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
> index 961f3af..fea0329 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -559,7 +559,7 @@ namespace gbe
>      /*! Encode ternary instructions */
>      void ALU3(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg src2);
>      /*! Encode sample instructions */
> -    void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler, bool is3D);
> +    void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister 
> + *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler, bool 
> + isLD);
>      /*! Encode typed write instructions */
>      void TYPED_WRITE(GenRegister *msgs, uint32_t msgNum, uint32_t bti, bool is3D);
>      /*! Get image information */
> @@ -1500,7 +1500,7 @@ namespace gbe
>  
>    void Selection::Opaque::SAMPLE(GenRegister *dst, uint32_t dstNum,
>                                   GenRegister *msgPayloads, uint32_t msgNum,
> -                                 uint32_t bti, uint32_t sampler, bool is3D) {
> +                                 uint32_t bti, uint32_t sampler, bool 
> + isLD) {
>      SelectionInstruction *insn = this->appendInsn(SEL_OP_SAMPLE, dstNum, msgNum);
>      SelectionVector *dstVector = this->appendVector();
>      SelectionVector *msgVector = this->appendVector(); @@ -1524,6 +1524,7 @@ namespace gbe
>      insn->extra.rdbti = bti;
>      insn->extra.sampler = sampler;
>      insn->extra.rdmsglen = msgNum;
> +    insn->extra.isLD = isLD;
>    }
>  
>    ///////////////////////////////////////////////////////////////////////////
> @@ -3161,21 +3162,36 @@ namespace gbe
>        GenRegister dst[insn.getDstNum()];
>        uint32_t srcNum = insn.getSrcNum();
>        uint32_t valueID = 0;
> +      uint32_t msgLen = 0;
>  
>        for (valueID = 0; valueID < insn.getDstNum(); ++valueID)
>          dst[valueID] = sel.selReg(insn.getDst(valueID), insn.getDstType());
>  
>        if (!insn.is3D())
>          srcNum--;
> -      /* U, V, [W] */
> -      for (valueID = 0; valueID < srcNum; ++valueID)
> -        msgPayloads[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
>  
> +      if (insn.getSamplerOffset() != 0) {
> +        // U, lod, V, [W]
> +        GBE_ASSERT(insn.getSrcType() != TYPE_FLOAT);
> +        msgPayloads[0] = sel.selReg(insn.getSrc(0), insn.getSrcType());
> +        msgPayloads[1] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
> +        msgPayloads[2] = sel.selReg(insn.getSrc(1), insn.getSrcType());
> +        if (srcNum > 2)
> +          msgPayloads[3] = sel.selReg(insn.getSrc(2), insn.getSrcType());
> +        // Clear the lod to zero.
> +        sel.MOV(msgPayloads[1], GenRegister::immud(0));
> +        msgLen = srcNum + 1;
> +      } else {
> +        // U, V, [W]
> +        GBE_ASSERT(insn.getSrcType() == TYPE_FLOAT);
> +        for (valueID = 0; valueID < srcNum; ++valueID)
> +          msgPayloads[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
> +        msgLen = srcNum;
> +      }
>        uint32_t bti = insn.getImageIndex();
> -      /* We have the clamp border workaround. */
> -      uint32_t sampler = insn.getSamplerIndex() + insn.getSamplerOffset() * 8;
> +      uint32_t sampler = insn.getSamplerIndex();
>  
> -      sel.SAMPLE(dst, insn.getDstNum(), msgPayloads, srcNum, bti, sampler, insn.is3D());
> +      sel.SAMPLE(dst, insn.getDstNum(), msgPayloads, msgLen, bti, 
> + sampler, insn.getSamplerOffset());
>        return true;
>      }
>      DECL_CTOR(SampleInstruction, 1, 1); diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
> index 85974f0..ad8c4ec 100644
> --- a/backend/src/backend/gen_insn_selection.hpp
> +++ b/backend/src/backend/gen_insn_selection.hpp
> @@ -123,6 +123,7 @@ namespace gbe
>          uint16_t rdbti:8;
>          uint16_t sampler:5;
>          uint16_t rdmsglen:3;
> +        bool     isLD;  // is this a ld message?
>        };
>        uint32_t barrierType;
>        bool longjmp;
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
> index 5a2ba16..b46e991 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -2210,6 +2210,12 @@ namespace gbe
>        case GEN_OCL_READ_IMAGE_I_3D:
>        case GEN_OCL_READ_IMAGE_UI_3D:
>        case GEN_OCL_READ_IMAGE_F_3D:
> +      case GEN_OCL_READ_IMAGE_I_I:
> +      case GEN_OCL_READ_IMAGE_UI_I:
> +      case GEN_OCL_READ_IMAGE_F_I:
> +      case GEN_OCL_READ_IMAGE_I_3D_I:
> +      case GEN_OCL_READ_IMAGE_UI_3D_I:
> +      case GEN_OCL_READ_IMAGE_F_3D_I:
>        {
>          // dst is a 4 elements vector. We allocate all 4 registers here.
>          uint32_t elemNum;
> @@ -2480,6 +2486,12 @@ namespace gbe
>            case GEN_OCL_READ_IMAGE_I_3D:
>            case GEN_OCL_READ_IMAGE_UI_3D:
>            case GEN_OCL_READ_IMAGE_F_3D:
> +          case GEN_OCL_READ_IMAGE_I_I:
> +          case GEN_OCL_READ_IMAGE_UI_I:
> +          case GEN_OCL_READ_IMAGE_F_I:
> +          case GEN_OCL_READ_IMAGE_I_3D_I:
> +          case GEN_OCL_READ_IMAGE_UI_3D_I:
> +          case GEN_OCL_READ_IMAGE_F_3D_I:
>            {
>              GBE_ASSERT(AI != AE); const ir::Register surfaceReg = this->getRegister(*AI); ++AI;
>              const uint8_t surfaceID = ctx.getFunction().getImageSet()->getIdx(surfaceReg);
> @@ -2491,7 +2503,12 @@ namespace gbe
>              GBE_ASSERT(AI != AE); const ir::Register vcoord = this->getRegister(*AI); ++AI;
>              ir::Register wcoord;
>              bool is3D = false;
> -            if (it->second >= GEN_OCL_READ_IMAGE_I_3D) {
> +            if (it->second == GEN_OCL_READ_IMAGE_I_3D    ||
> +                it->second == GEN_OCL_READ_IMAGE_UI_3D   ||
> +                it->second == GEN_OCL_READ_IMAGE_F_3D    ||
> +                it->second == GEN_OCL_READ_IMAGE_I_3D_I  ||
> +                it->second == GEN_OCL_READ_IMAGE_UI_3D_I ||
> +                it->second == GEN_OCL_READ_IMAGE_F_3D_I) {
>                GBE_ASSERT(AI != AE); wcoord = this->getRegister(*AI); ++AI;
>                is3D = true;
>              } else
> @@ -2524,18 +2541,26 @@ namespace gbe
>                case GEN_OCL_READ_IMAGE_UI:
>                case GEN_OCL_READ_IMAGE_I_3D:
>                case GEN_OCL_READ_IMAGE_UI_3D:
> +              case GEN_OCL_READ_IMAGE_I_I:
> +              case GEN_OCL_READ_IMAGE_UI_I:
> +              case GEN_OCL_READ_IMAGE_I_3D_I:
> +              case GEN_OCL_READ_IMAGE_UI_3D_I:
>                  dstType = ir::TYPE_U32;
>                  break;
>                case GEN_OCL_READ_IMAGE_F:
>                case GEN_OCL_READ_IMAGE_F_3D:
> +              case GEN_OCL_READ_IMAGE_F_I:
> +              case GEN_OCL_READ_IMAGE_F_3D_I:
>                  dstType = ir::TYPE_FLOAT;
>                  break;
>                default:
>                  GBE_ASSERT(0); // never been here.
>              }
>  
> +            bool isFloatCoord = it->second <= GEN_OCL_READ_IMAGE_F_3D;
> +
>              ctx.SAMPLE(surfaceID, dstTuple, srcTuple, dstType == ir::TYPE_FLOAT,
> -                       true, sampler, samplerOffset, is3D);
> +                       isFloatCoord, sampler, samplerOffset, is3D);
>              break;
>            }
>            case GEN_OCL_WRITE_IMAGE_I:
> diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
> index 5bf794a..7058a60 100644
> --- a/backend/src/llvm/llvm_gen_ocl_function.hxx
> +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
> @@ -49,10 +49,16 @@ DECL_LLVM_GEN_FUNCTION(FORCE_SIMD16, __gen_ocl_force_simd16)  DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I, _Z21__gen_ocl_read_imageijtffj)  DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI, _Z22__gen_ocl_read_imageuijtffj)  DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F, _Z21__gen_ocl_read_imagefjtffj)
> -
>  DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_3D, _Z21__gen_ocl_read_imageijtfffj)  DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_3D, _Z22__gen_ocl_read_imageuijtfffj)  DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_3D, _Z21__gen_ocl_read_imagefjtfffj)
> +// work around read image with the LD message. The coords are integer type.
> +DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_I, _Z21__gen_ocl_read_imageijtiij) 
> +DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_I, 
> +_Z22__gen_ocl_read_imageuijtiij) DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_I, 
> +_Z21__gen_ocl_read_imagefjtiij) 
> +DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_3D_I, 
> +_Z21__gen_ocl_read_imageijtiiij) 
> +DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_3D_I, 
> +_Z22__gen_ocl_read_imageuijtiiij) 
> +DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_3D_I, 
> +_Z21__gen_ocl_read_imagefjtiiij)
>  
>  // To write_image functions.
>  DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_I, _Z22__gen_ocl_write_imageijiiDv4_i)
> diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
> index 7095473..911be30 100644
> --- a/backend/src/llvm/llvm_scalarize.cpp
> +++ b/backend/src/llvm/llvm_scalarize.cpp
> @@ -1,4 +1,4 @@
> -/*
> +;/*
>   * Copyright © 2012 Intel Corporation
>   *
>   * This library is free software; you can redistribute it and/or @@ -649,6 +649,12 @@ namespace gbe {
>            case GEN_OCL_READ_IMAGE_I_3D:
>            case GEN_OCL_READ_IMAGE_UI_3D:
>            case GEN_OCL_READ_IMAGE_F_3D:
> +          case GEN_OCL_READ_IMAGE_I_I:
> +          case GEN_OCL_READ_IMAGE_UI_I:
> +          case GEN_OCL_READ_IMAGE_F_I:
> +          case GEN_OCL_READ_IMAGE_I_3D_I:
> +          case GEN_OCL_READ_IMAGE_UI_3D_I:
> +          case GEN_OCL_READ_IMAGE_F_3D_I:
>            case GEN_OCL_GET_IMAGE_WIDTH:
>            case GEN_OCL_GET_IMAGE_HEIGHT:
>            {
> @@ -797,7 +803,6 @@ namespace gbe {
>      for (SmallVectorImpl<PHINode*>::iterator phiI = incompletePhis.begin(), phiE = incompletePhis.end();
>         phiI != phiE; ++phiI) {
>        assert(canGetComponentArgs(*phiI) && "Phi's operands never scalarized");
> -
>        // Fill in each component of this phi
>        VectorValues& vVals = vectorVals[*phiI];
>        for (int c = 0; c < GetComponentCount(*phiI); ++c) { diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h index 50107d8..b7dc607 100755
> --- a/backend/src/ocl_stdlib.tmpl.h
> +++ b/backend/src/ocl_stdlib.tmpl.h
> @@ -4538,12 +4538,18 @@ int __gen_ocl_force_simd16(void);  /////////////////////////////////////////////////////////////////////////////
>  
>  OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
> +OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t 
> +sampler, int u, int v, uint sampler_offset);
>  OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
> +OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t 
> +sampler, int u, int v, uint sampler_offset);
>  OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
> +OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t 
> +sampler, int u, int v, uint sampler_offset);
>  
>  OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
> +OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t 
> +sampler, int u, int v, int w, uint sampler_offset);
>  OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
> +OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t 
> +sampler, int u, int v, int w, uint sampler_offset);
>  OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
> +OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t 
> +sampler, int u, int v, int w, uint sampler_offset);
>  
>  OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int4 color);  OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, int v, uint4 color); @@ -4567,8 +4573,27 @@ int __gen_ocl_get_image_depth(uint surface_id);  #define GEN_FIX_1 0  #endif
>  
> -#define DECL_READ_IMAGE(float_coord_rounding_fix, int_clamping_fix,          \
> -                        image_type, type, suffix, coord_type)                \
> +#define DECL_READ_IMAGE0(float_coord_rounding_fix, int_clamping_fix,          \
> +                        image_type, type, suffix, coord_type, n)             \
> +  INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image,          \
> +                                               const sampler_t sampler,      \
> +                                               coord_type coord)             \
> +  {                                                                          \
> +    GET_IMAGE(cl_image, surface_id);                                         \
> +    if (float_coord_rounding_fix | int_clamping_fix) {                       \
> +      if (((sampler & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP)              \
> +          && ((sampler & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST)) {        \
> +        if (int_clamping_fix)                                                \
> +            return   __gen_ocl_read_image ##suffix(                          \
> +                        EXPEND_READ_COORD(surface_id, sampler, coord), 1);\
> +      }                                                                      \
> +    }                                                                        \
> +    return  __gen_ocl_read_image ##suffix(                                   \
> +                        EXPEND_READ_COORD(surface_id, sampler, 
> +(float)coord), 0);\
> +  }
> +
> >>>>>>>>>>>>> float_coord_rounding_fix is useless in DECL_READ_IMAGE0. In fact, the only difference between the two return statements is the last parameter, so why not use a variable to avoid the if blocks?

Good idea. After this patch, the macro could be simplified a little bit.

> 
> 
> 
> +#define DECL_READ_IMAGE1(float_coord_rounding_fix, int_clamping_fix,          \
> +                        image_type, type, suffix, coord_type, n)             \
>    INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image,          \
>                                                 const sampler_t sampler,      \
>                                                 coord_type coord)             \
> @@ -4576,25 +4601,20 @@ int __gen_ocl_get_image_depth(uint surface_id);
>      GET_IMAGE(cl_image, surface_id);                                         \
>      coord_type tmpCoord = coord;                                             \
>      if (float_coord_rounding_fix | int_clamping_fix) {                       \
> -      if (((sampler & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP)         \
> -          && ((sampler & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST)) {   \
> +      if (((sampler & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP)              \
> +          && ((sampler & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST)) {        \
>          if (float_coord_rounding_fix                                         \
> -            && ((sampler & CLK_NORMALIZED_COORDS_TRUE) == 0)) {         \
> +            && ((sampler & CLK_NORMALIZED_COORDS_TRUE) == 0)) {              \
>            FIXUP_FLOAT_COORD(tmpCoord);                                       \
>          }                                                                    \
>          if (int_clamping_fix) {                                              \
> -           if (OUT_OF_BOX(tmpCoord, surface_id,                              \
> -                          (sampler & CLK_NORMALIZED_COORDS_TRUE))) {    \
> -            unsigned int border_alpha;                                       \
> -            int order = __gen_ocl_get_image_channel_order(surface_id);       \
> -            if (!CLK_HAS_ALPHA(order)) {                                     \
> -              border_alpha = 1;                                              \
> +            coord_type intCoord;                                             \
> +            if (sampler & CLK_NORMALIZED_COORDS_TRUE) {                      \
> +              DENORMALIZE_COORD(surface_id, intCoord, tmpCoord);             \
>              } else                                                           \
> -              border_alpha = 0;                                              \
> -              return (type)(0, 0, 0, border_alpha);                          \
> -          } else                                                             \
> +              intCoord = tmpCoord;                                           \
>              return   __gen_ocl_read_image ##suffix(                          \
> -                        EXPEND_READ_COORD(surface_id, sampler, tmpCoord), 1);\
> +                        EXPEND_READ_COORD1(surface_id, sampler, 
> + intCoord), 1);\
>         }                                                                     \
>        }                                                                      \
>      }                                                                        \
> >>>>>>Now only float coordinates use DECL_READ_IMAGE1, so why is int_clamping_fix still needed here?
The int clamping fix is for integer surface types, not for the coordinate type.
We need this fix for both float and integer coordinates whenever the surface is
of integer type.