[Mesa-dev] [PATCH] ac/nir: Add workaround for GFX9 buffer views.

Thu Mar 15 08:07:28 UTC 2018

On 03/09/2018 12:38 AM, Bas Nieuwenhuizen wrote:
> On GFX9 whether the buffer size is interpreted as elements or bytes
> depends on whether IDXEN is enabled in the instruction. If the index
> is a constant zero, LLVM optimizes IDXEN to 0.
> 
> Now the size in elements is interpreted in bytes which of course
> results in out of bounds accesses.
> 
> The correct fix is most likely to disable the LLVM optimization,
> but we need something to work with LLVM <= 6.0.
> 
> radeonsi does the max between stride and element count on the CPU
> but that results in the size intrinsics returning the wrong size
> for the buffer. This would cause CTS errors for radv.
> 
> v2: Also include the store changes.
> 
> Fixes: e38685cc62e 'Revert "radv: disable support for VEGA for now."'
> ---
>   src/amd/common/ac_llvm_build.c  | 24 ++++++++++++++++++++++++
>   src/amd/common/ac_llvm_build.h  | 10 ++++++++++
>   src/amd/common/ac_nir_to_llvm.c | 39 ++++++++++++++++++++++++++++++++-------
>   src/amd/common/ac_shader_abi.h  |  4 ++++
>   4 files changed, 70 insertions(+), 7 deletions(-)
> 
> diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
> index 9851cafb7f..0fe39a97cb 100644
> --- a/src/amd/common/ac_llvm_build.c
> +++ b/src/amd/common/ac_llvm_build.c
> @@ -1082,6 +1082,30 @@ LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx,
>   					   can_speculate, true);
>   }
>   
> +LLVMValueRef ac_build_buffer_load_format_gfx9_safe(struct ac_llvm_context *ctx,
> +						   LLVMValueRef rsrc,
> +						   LLVMValueRef vindex,
> +						   LLVMValueRef voffset,
> +						   unsigned num_channels,
> +						   bool glc,
> +						   bool can_speculate)
> +{
> +	LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 2, 0), "");
> +	LLVMValueRef stride = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 1, 0), "");
> +	stride = LLVMBuildLShr(ctx->builder, stride, LLVMConstInt(ctx->i32, 16, 0), "");
> +
> +	LLVMValueRef new_elem_count = LLVMBuildSelect(ctx->builder,
> +	                                              LLVMBuildICmp(ctx->builder, LLVMIntUGT, elem_count, stride, ""),
> +	                                              elem_count, stride, "");
> +
> +	LLVMValueRef new_rsrc = LLVMBuildInsertElement(ctx->builder, rsrc, new_elem_count,
> +						       LLVMConstInt(ctx->i32, 2, 0), "");
> +
> +	return ac_build_buffer_load_common(ctx, new_rsrc, vindex, voffset,
> +					   num_channels, glc, false,
> +					   can_speculate, true);
> +}
> +
>   /**
>    * Set range metadata on an instruction.  This can only be used on load and
>    * call instructions.  If you know an instruction can only produce the values
> diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
> index c080381d21..e469668f08 100644
> --- a/src/amd/common/ac_llvm_build.h
> +++ b/src/amd/common/ac_llvm_build.h
> @@ -242,6 +242,16 @@ LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx,
>   					 bool glc,
>   					 bool can_speculate);
>   
> +/* load_format that handles the stride & element count better if idxen is
> + * disabled by LLVM. */
> +LLVMValueRef ac_build_buffer_load_format_gfx9_safe(struct ac_llvm_context *ctx,
> +						   LLVMValueRef rsrc,
> +						   LLVMValueRef vindex,
> +						   LLVMValueRef voffset,
> +						   unsigned num_channels,
> +						   bool glc,
> +						   bool can_speculate);
> +
>   LLVMValueRef
>   ac_get_thread_id(struct ac_llvm_context *ctx);
>   
> diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
> index 9b85069860..8ec5002c47 100644
> --- a/src/amd/common/ac_nir_to_llvm.c
> +++ b/src/amd/common/ac_nir_to_llvm.c
> @@ -2263,12 +2263,21 @@ static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx,
>   	if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
>   		unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
>   
> -		return ac_build_buffer_load_format(&ctx->ac,
> -						   args->resource,
> -						   args->addr,
> -						   ctx->ac.i32_0,
> -						   util_last_bit(mask),
> -						   false, true);
> +		if (ctx->abi->gfx9_stride_size_workaround) {
> +			return ac_build_buffer_load_format_gfx9_safe(&ctx->ac,
> +								     args->resource,
> +								     args->addr,
> +								     ctx->ac.i32_0,
> +								     util_last_bit(mask),
> +								     false, true);
> +		} else {
> +			return ac_build_buffer_load_format(&ctx->ac,
> +							   args->resource,
> +							   args->addr,
> +							   ctx->ac.i32_0,
> +							   util_last_bit(mask),
> +							   false, true);
> +		}
>   	}
>   
>   	args->opcode = ac_image_sample;
> @@ -3657,8 +3666,23 @@ static void visit_image_store(struct ac_nir_context *ctx,
>   		glc = ctx->ac.i1true;
>   
>   	if (dim == GLSL_SAMPLER_DIM_BUF) {
> +		LLVMValueRef rsrc = get_sampler_desc(ctx, instr->variables[0], AC_DESC_BUFFER, NULL, true, true);
> +
> +		if (ctx->abi->gfx9_stride_size_workaround) {
> +			LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 2, 0), "");
> +			LLVMValueRef stride = LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 1, 0), "");
> +			stride = LLVMBuildLShr(ctx->ac.builder, stride, LLVMConstInt(ctx->ac.i32, 16, 0), "");
> +
> +			LLVMValueRef new_elem_count = LLVMBuildSelect(ctx->ac.builder,
> +			                                              LLVMBuildICmp(ctx->ac.builder, LLVMIntUGT, elem_count, stride, ""),
> +			                                              elem_count, stride, "");
> +
> +			rsrc = LLVMBuildInsertElement(ctx->ac.builder, rsrc, new_elem_count,
> +			                              LLVMConstInt(ctx->ac.i32, 2, 0), "");

I think it would be better to have a function that returns the new 
resource and use it here and in the load case.

Did you check that DOW3 is fixed with this?

> +		}
> +
>   		params[0] = ac_to_float(&ctx->ac, get_src(ctx, instr->src[2])); /* data */
> -		params[1] = get_sampler_desc(ctx, instr->variables[0], AC_DESC_BUFFER, NULL, true, true);
> +		params[1] = rsrc;
>   		params[2] = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[0]),
>   						    ctx->ac.i32_0, ""); /* vindex */
>   		params[3] = ctx->ac.i32_0; /* voffset */
> @@ -6915,6 +6939,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
>   	ctx.abi.load_sampler_desc = radv_get_sampler_desc;
>   	ctx.abi.load_resource = radv_load_resource;
>   	ctx.abi.clamp_shadow_reference = false;
> +	ctx.abi.gfx9_stride_size_workaround = ctx.ac.chip_class == GFX9;
>   
>   	if (shader_count >= 2)
>   		ac_init_exec_full_mask(&ctx.ac);
> diff --git a/src/amd/common/ac_shader_abi.h b/src/amd/common/ac_shader_abi.h
> index 901e49b1f9..49ded57180 100644
> --- a/src/amd/common/ac_shader_abi.h
> +++ b/src/amd/common/ac_shader_abi.h
> @@ -189,6 +189,10 @@ struct ac_shader_abi {
>   	/* Whether to clamp the shadow reference value to [0,1]on VI. Radeonsi currently
>   	 * uses it due to promoting D16 to D32, but radv needs it off. */
>   	bool clamp_shadow_reference;
> +
> +	/* Whether to workaround GFX9 ignoring the stride for the buffer size if IDXEN=0
> +	 * and LLVM optimizes an indexed load with constant index to IDXEN=0. */
> +	bool gfx9_stride_size_workaround;
>   };
>   
>   #endif /* AC_SHADER_ABI_H */
>