[Beignet] [PATCH] HSW: Fix potential issue of GT3 when calc stack address.

Wed Jun 11 23:02:21 PDT 2014

Tested on my HSW platform;
no obvious regressions found.

On Thu, 2014-06-12 at 19:42 +0800, Yang Rong wrote:
> GT3 has 4 half slices, so the left shift should be 2 bits, and the stack buffer size should also be enlarged;
> otherwise, if threads are not scheduled evenly across the half slices, stack accesses may go out of bounds.
> Per the Bspec, the scratch size needs to be set to 2X the desired size.
> 
> Signed-off-by: Yang Rong <rong.r.yang at intel.com>
> ---
>  backend/src/backend/gen75_context.cpp | 4 ++--
>  src/cl_command_queue_gen7.c           | 6 ++++++
>  src/intel/intel_gpgpu.c               | 3 +++
>  3 files changed, 11 insertions(+), 2 deletions(-)
> 
> diff --git a/backend/src/backend/gen75_context.cpp b/backend/src/backend/gen75_context.cpp
> index aedd4d3..da0db85 100644
> --- a/backend/src/backend/gen75_context.cpp
> +++ b/backend/src/backend/gen75_context.cpp
> @@ -92,12 +92,12 @@ namespace gbe
>        p->curr.predicate = GEN_PREDICATE_NONE;
>        //p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff));
>        p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x7f));
> -      p->AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), GenRegister::immud(0x80));
> +      p->AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), GenRegister::immud(0x180));
>        p->SHR(GenRegister::ud1grf(126,4), GenRegister::ud1grf(126, 4), GenRegister::immud(7));
>        p->curr.execWidth = this->simdWidth;
>        p->SHL(stackptr, stackptr, GenRegister::immud(perLaneShift));
>        p->curr.execWidth = 1;
> -      p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(1));
> +      p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(2));
>        p->ADD(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::ud1grf(126, 4));
>        p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(perThreadShift));
>        p->curr.execWidth = this->simdWidth;
> diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
> index 9680535..af3030c 100644
> --- a/src/cl_command_queue_gen7.c
> +++ b/src/cl_command_queue_gen7.c
> @@ -244,6 +244,12 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
>    assert(offset >= 0);
>    stack_sz *= gbe_kernel_get_simd_width(ker->opaque);
>    stack_sz *= device->max_compute_unit;
> +  /* Because HSW calc stack offset per thread is relative with half slice, when
> +     thread schedule in half slice is not balance, would out of bound. Because
> +     the max half slice is 4 in GT4, multiply stack size with 4 for safe.
> +   */
> +  if(cl_driver_get_ver(ctx->drv) == 75)
> +    stack_sz *= 4;
>    cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cl_gpgpu_get_cache_ctrl());
>  }
>  
> diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
> index 5093583..cae843b 100644
> --- a/src/intel/intel_gpgpu.c
> +++ b/src/intel/intel_gpgpu.c
> @@ -833,6 +833,9 @@ intel_gpgpu_set_scratch(intel_gpgpu_t * gpgpu, uint32_t per_thread_size)
>    drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
>    drm_intel_bo* old = gpgpu->scratch_b.bo;
>    uint32_t total = per_thread_size * gpgpu->max_threads;
> +  /* Per Bspec, scratch should 2X the desired size, otherwise luxmark may hang */
> +  if (IS_HASWELL(gpgpu->drv->device_id))
> +      total *= 2;
>  
>    gpgpu->per_thread_scratch = per_thread_size;
>