[Mesa-dev] [PATCH] r600g: Workaround for a nested loop bug on Cayman

Sun Apr 14 16:09:35 PDT 2013

On 04/15/2013 01:05 AM, Martin Andersson wrote:
> There is a bug where a BREAK/CONTINUE followed by LOOP_STARTxxx for nested
> loops may put the branch stack into a state such that ALU_PUSH_BEFORE
> doesn't work as expected. Workaround this by replacing the ALU_PUSH_BEFORE
> with a PUSH + ALU for nested loops.
>
> Fixes piglit tests:
> spec/!OpenGL 1.1/read-front
> spec/EXT_transform_feedback/order*
> spec/glsl-1.40/uniform_buffer/fs-struct-pad
>
> No piglit regressions.
> ---
>   src/gallium/drivers/r600/r600_shader.c | 33 ++++++++++++++++++++++++++++++---
>   1 file changed, 30 insertions(+), 3 deletions(-)
>
> diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
> index 6dbca50..aee011e 100644
> --- a/src/gallium/drivers/r600/r600_shader.c
> +++ b/src/gallium/drivers/r600/r600_shader.c
> @@ -252,6 +252,7 @@ static int tgsi_endif(struct r600_shader_ctx *ctx);
>   static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
>   static int tgsi_endloop(struct r600_shader_ctx *ctx);
>   static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
> +static bool need_cayman_loop_bug_workaround(struct r600_shader_ctx *ctx);
>
>   /*
>    * bytestream -> r600 shader
> @@ -5490,7 +5491,7 @@ static int tgsi_opdst(struct r600_shader_ctx *ctx)
>   	return 0;
>   }
>
> -static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode)
> +static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type)
>   {
>   	struct r600_bytecode_alu alu;
>   	int r;
> @@ -5510,7 +5511,7 @@ static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode)
>
>   	alu.last = 1;
>
> -	r = r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_PUSH_BEFORE);
> +	r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
>   	if (r)
>   		return r;
>   	return 0;
> @@ -5730,7 +5731,20 @@ static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
>
>   static int tgsi_if(struct r600_shader_ctx *ctx)
>   {
> -	emit_logic_pred(ctx, ALU_OP2_PRED_SETNE_INT);
> +	int alu_type = CF_OP_ALU_PUSH_BEFORE;
> +
> +	/*
> +	   There is a bug where a BREAK/CONTINUE followed by LOOP_STARTxxx for nested
> +	   loops may put the branch stack into a state such that ALU_PUSH_BEFORE
> +	   doesn't work as expected. Workaround this by replacing the ALU_PUSH_BEFORE
> +	   with a PUSH + ALU for nested loops.
> +	 */
> +	if (ctx->bc->chip_class == CAYMAN && need_cayman_loop_bug_workaround(ctx)) {

We already have current loop level for the stack size computation, see 
r600_bytecode::stack, so I think need_cayman_loop_bug_workaround call 
may be replaced with "ctx->bc->stack.loop > 1", if I'm not missing 
something.

Vadim

> +		r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
> +		alu_type = CF_OP_ALU;
> +	}
> +
> +	emit_logic_pred(ctx, ALU_OP2_PRED_SETNE_INT, alu_type);
>
>   	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
>
> @@ -5834,6 +5848,19 @@ static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
>   	return 0;
>   }
>
> +static bool need_cayman_loop_bug_workaround(struct r600_shader_ctx *ctx)
> +{
> +	unsigned int fscp;
> +	int num_loops = 0;
> +	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
> +	{
> +		if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
> +			++num_loops;
> +	}
> +
> +	return num_loops >= 2;
> +}
> +
>   static int tgsi_umad(struct r600_shader_ctx *ctx)
>   {
>   	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
>