[Beignet] [PATCH] GBE: make byte/short vload/vstore process one element each time.

Zhigang Gong zhigang.gong at linux.intel.com
Mon Mar 24 22:28:46 PDT 2014


LGTM, pushed, thanks.

On Wed, Mar 19, 2014 at 11:41:54AM +0800, Ruiling Song wrote:
> Per OCL Spec, the computed address (p+offset*n) is 8-bit aligned for char,
> and 16-bit aligned for short in vloadn & vstoren. That is, we cannot assume that
> vload4 with a char pointer is 4-byte aligned. The previous implementation makes
> Clang generate a load or store with alignment 4 when the real alignment is only 1.
> 
> We need to find another way to optimize vloadn.
> But before that, let's keep vloadn and vstoren working correctly.
> This could fix the regression caused by the byte/short optimization.
> 
> Signed-off-by: Ruiling Song <ruiling.song at intel.com>
> ---
>  backend/src/ocl_stdlib.tmpl.h |   60 ++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 56 insertions(+), 4 deletions(-)
> 
> diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
> index e3ac632..25f2ff7 100755
> --- a/backend/src/ocl_stdlib.tmpl.h
> +++ b/backend/src/ocl_stdlib.tmpl.h
> @@ -3882,10 +3882,59 @@ INLINE_OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
>    DECL_UNTYPED_RD_ALL_SPACE(TYPE, __constant) \
>    DECL_UNTYPED_RW_ALL_SPACE(TYPE, __private)
>  
> -DECL_UNTYPED_RW_ALL(char)
> -DECL_UNTYPED_RW_ALL(uchar)
> -DECL_UNTYPED_RW_ALL(short)
> -DECL_UNTYPED_RW_ALL(ushort)
> +#define DECL_BYTE_RD_SPACE(TYPE, SPACE) \
> +INLINE_OVERLOADABLE TYPE##2 vload2(size_t offset, const SPACE TYPE *p) { \
> +  return (TYPE##2)(*(p+2*offset), *(p+2*offset+1)); \
> +} \
> +INLINE_OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
> +  return (TYPE##3)(*(p+3*offset), *(p+3*offset+1), *(p+3*offset+2)); \
> +} \
> +INLINE_OVERLOADABLE TYPE##4 vload4(size_t offset, const SPACE TYPE *p) { \
> +  return (TYPE##4)(vload2(2*offset, p), vload2(2*offset, p+2)); \
> +} \
> +INLINE_OVERLOADABLE TYPE##8 vload8(size_t offset, const SPACE TYPE *p) { \
> +  return (TYPE##8)(vload4(2*offset, p), vload4(2*offset, p+4)); \
> +} \
> +INLINE_OVERLOADABLE TYPE##16 vload16(size_t offset, const SPACE TYPE *p) { \
> +  return (TYPE##16)(vload8(2*offset, p), vload8(2*offset, p+8)); \
> +}
> +
> +#define DECL_BYTE_WR_SPACE(TYPE, SPACE) \
> +INLINE_OVERLOADABLE void vstore2(TYPE##2 v, size_t offset, SPACE TYPE *p) {\
> +  *(p + 2 * offset) = v.s0; \
> +  *(p + 2 * offset + 1) = v.s1; \
> +} \
> +INLINE_OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p) {\
> +  *(p + 3 * offset) = v.s0; \
> +  *(p + 3 * offset + 1) = v.s1; \
> +  *(p + 3 * offset + 2) = v.s2; \
> +} \
> +INLINE_OVERLOADABLE void vstore4(TYPE##4 v, size_t offset, SPACE TYPE *p) { \
> +  vstore2(v.lo, 2*offset, p); \
> +  vstore2(v.hi, 2*offset, p+2); \
> +} \
> +INLINE_OVERLOADABLE void vstore8(TYPE##8 v, size_t offset, SPACE TYPE *p) { \
> +  vstore4(v.lo, 2*offset, p); \
> +  vstore4(v.hi, 2*offset, p+4); \
> +} \
> +INLINE_OVERLOADABLE void vstore16(TYPE##16 v, size_t offset, SPACE TYPE *p) { \
> +  vstore8(v.lo, 2*offset, p); \
> +  vstore8(v.hi, 2*offset, p+8); \
> +}
> +
> +#define DECL_BYTE_RW_ALL(TYPE) \
> +  DECL_BYTE_RD_SPACE(TYPE, __global) \
> +  DECL_BYTE_RD_SPACE(TYPE, __local) \
> +  DECL_BYTE_RD_SPACE(TYPE, __private) \
> +  DECL_BYTE_RD_SPACE(TYPE, __constant) \
> +  DECL_BYTE_WR_SPACE(TYPE, __global) \
> +  DECL_BYTE_WR_SPACE(TYPE, __local) \
> +  DECL_BYTE_WR_SPACE(TYPE, __private)
> +
> +DECL_BYTE_RW_ALL(char)
> +DECL_BYTE_RW_ALL(uchar)
> +DECL_BYTE_RW_ALL(short)
> +DECL_BYTE_RW_ALL(ushort)
>  DECL_UNTYPED_RW_ALL(int)
>  DECL_UNTYPED_RW_ALL(uint)
>  DECL_UNTYPED_RW_ALL(long)
> @@ -3900,6 +3949,9 @@ DECL_UNTYPED_RW_ALL(double)
>  #undef DECL_UNTYPED_RD_SPACE_N
>  #undef DECL_UNTYPED_V3_SPACE
>  #undef DECL_UNTYPED_RDV3_SPACE
> +#undef DECL_BYTE_RD_SPACE
> +#undef DECL_BYTE_WR_SPACE
> +#undef DECL_BYTE_RW_ALL
>  
>  PURE CONST float __gen_ocl_f16to32(short h);
>  PURE CONST short __gen_ocl_f32to16(float f);
> -- 
> 1.7.9.5
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


More information about the Beignet mailing list