[Mesa-dev] [PATCH] r600g: avoid shader needing too many gpr to lockup the gpu

Fri Oct 26 19:01:26 PDT 2012

From: Jerome Glisse <jglisse at redhat.com>

On r6xx/r7xx shader resource management need to make sure that the
shader does not goes over the gpr register limit. Each specific
asic has a maxmimum register that can be split btw shader stage.
For each stage the shader must not use more register than the
limit programmed.

Signed-off-by: Jerome Glisse <jglisse at redhat.com>
---
 src/gallium/drivers/r600/r600_pipe.h         |  1 +
 src/gallium/drivers/r600/r600_state.c        | 60 +++++++++++++++++++---------
 src/gallium/drivers/r600/r600_state_common.c | 22 +++++-----
 3 files changed, 55 insertions(+), 28 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index ff2a5fd..2045af3 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -363,6 +363,7 @@ struct r600_context {
 	enum chip_class			chip_class;
 	boolean				has_vertex_cache;
 	boolean				keep_tiling_flags;
+	bool				discard_draw;
 	unsigned			default_ps_gprs, default_vs_gprs;
 	unsigned			r6xx_num_clause_temp_gprs;
 	unsigned			backend_mask;
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index 7d07008..43af934 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -2189,30 +2189,54 @@ void r600_init_state_functions(struct r600_context *rctx)
 /* Adjust GPR allocation on R6xx/R7xx */
 void r600_adjust_gprs(struct r600_context *rctx)
 {
-	unsigned num_ps_gprs = rctx->default_ps_gprs;
-	unsigned num_vs_gprs = rctx->default_vs_gprs;
+	unsigned num_ps_gprs = rctx->ps_shader->current->shader.bc.ngpr;
+	unsigned num_vs_gprs = rctx->vs_shader->current->shader.bc.ngpr;
+	unsigned new_num_ps_gprs = num_ps_gprs;
+	unsigned new_num_vs_gprs = num_vs_gprs;
+	unsigned cur_num_ps_gprs = G_008C04_NUM_PS_GPRS(rctx->config_state.sq_gpr_resource_mgmt_1);
+	unsigned cur_num_vs_gprs = G_008C04_NUM_VS_GPRS(rctx->config_state.sq_gpr_resource_mgmt_1);
+	unsigned def_num_ps_gprs = rctx->default_ps_gprs;
+	unsigned def_num_vs_gprs = rctx->default_vs_gprs;
+	unsigned def_num_clause_temp_gprs = rctx->r6xx_num_clause_temp_gprs;
+	/* hardware will reserve twice num_clause_temp_gprs */
+	unsigned max_gprs = def_num_ps_gprs + def_num_vs_gprs + def_num_clause_temp_gprs * 2;
 	unsigned tmp;
-	int diff;
 
-	if (rctx->ps_shader->current->shader.bc.ngpr > rctx->default_ps_gprs) {
-		diff = rctx->ps_shader->current->shader.bc.ngpr - rctx->default_ps_gprs;
-		num_vs_gprs -= diff;
-		num_ps_gprs += diff;
+	/* the sum of all SQ_GPR_RESOURCE_MGMT*.NUM_*_GPRS must <= to max_gprs */
+	if (new_num_ps_gprs > cur_num_ps_gprs || new_num_vs_gprs > cur_num_vs_gprs) {
+		/* try to use switch back to default */
+		if (new_num_ps_gprs > def_num_ps_gprs || new_num_vs_gprs > def_num_vs_gprs) {
+			/* always privilege vs stage so that at worst we have the
+			 * pixel stage producing wrong output (not the vertex
+			 * stage) */
+			new_num_ps_gprs = max_gprs - (new_num_vs_gprs + def_num_clause_temp_gprs * 2);
+			new_num_vs_gprs = num_vs_gprs;
+		} else {
+			new_num_ps_gprs = def_num_ps_gprs;
+			new_num_vs_gprs = def_num_vs_gprs;
+		}
+	} else {
+		rctx->discard_draw = false;
+		return;
 	}
 
-	if (rctx->vs_shader->current->shader.bc.ngpr > rctx->default_vs_gprs)
-	{
-		diff = rctx->vs_shader->current->shader.bc.ngpr - rctx->default_vs_gprs;
-		num_ps_gprs -= diff;
-		num_vs_gprs += diff;
+	/* SQ_PGM_RESOURCES_*.NUM_GPRS must always be program to a value <=
+	 * SQ_GPR_RESOURCE_MGMT*.NUM_*_GPRS otherwise the GPU will lockup
+	 * Also if a shader use more gpr than SQ_GPR_RESOURCE_MGMT*.NUM_*_GPRS
+	 * it will lockup. So in this case just discard the draw command
+	 * and don't change the current gprs repartitions.
+	 */
+	rctx->discard_draw = false;
+	if (num_ps_gprs > new_num_ps_gprs || num_vs_gprs > new_num_vs_gprs) {
+		rctx->discard_draw = true;
+		return;
 	}
 
-	tmp = 0;
-	tmp |= S_008C04_NUM_PS_GPRS(num_ps_gprs);
-	tmp |= S_008C04_NUM_VS_GPRS(num_vs_gprs);
-	tmp |= S_008C04_NUM_CLAUSE_TEMP_GPRS(rctx->r6xx_num_clause_temp_gprs);
-
-	if (tmp != rctx->config_state.sq_gpr_resource_mgmt_1) {
+	/* in some case we endup recomputing the current value */
+	tmp = S_008C04_NUM_PS_GPRS(new_num_ps_gprs) |
+		S_008C04_NUM_VS_GPRS(new_num_vs_gprs) |
+		S_008C04_NUM_CLAUSE_TEMP_GPRS(def_num_clause_temp_gprs);
+	if (rctx->config_state.sq_gpr_resource_mgmt_1 != tmp) {
 		rctx->config_state.sq_gpr_resource_mgmt_1 = tmp;
 		rctx->config_state.atom.dirty = true;
 		rctx->flags |= R600_CONTEXT_PS_PARTIAL_FLUSH;
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index 65985c7..5827efb 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -755,10 +755,6 @@ static int r600_shader_select(struct pipe_context *ctx,
 	shader->next_variant = sel->current;
 	sel->current = shader;
 
-	if (rctx->chip_class < EVERGREEN && rctx->ps_shader && rctx->vs_shader) {
-		r600_adjust_gprs(rctx);
-	}
-
 	if (rctx->ps_shader &&
 	    rctx->cb_misc_state.nr_ps_color_outputs != rctx->ps_shader->current->nr_ps_color_outputs) {
 		rctx->cb_misc_state.nr_ps_color_outputs = rctx->ps_shader->current->nr_ps_color_outputs;
@@ -814,9 +810,6 @@ static void r600_bind_ps_state(struct pipe_context *ctx, void *state)
 			rctx->cb_misc_state.multiwrite = multiwrite;
 			rctx->cb_misc_state.atom.dirty = true;
 		}
-
-		if (rctx->vs_shader)
-			r600_adjust_gprs(rctx);
 	}
 
 	if (rctx->cb_misc_state.nr_ps_color_outputs != rctx->ps_shader->current->nr_ps_color_outputs) {
@@ -839,9 +832,6 @@ static void r600_bind_vs_state(struct pipe_context *ctx, void *state)
 	if (state) {
 		r600_context_pipe_state_set(rctx, &rctx->vs_shader->current->rstate);
 
-		if (rctx->chip_class < EVERGREEN && rctx->ps_shader)
-			r600_adjust_gprs(rctx);
-
 		/* Update clip misc state. */
 		if (rctx->vs_shader->current->pa_cl_vs_out_cntl != rctx->clip_misc_state.pa_cl_vs_out_cntl ||
 		    rctx->vs_shader->current->shader.clip_dist_write != rctx->clip_misc_state.clip_dist_write) {
@@ -1055,6 +1045,10 @@ static void r600_update_derived_state(struct r600_context *rctx)
 
 	r600_shader_select(ctx, rctx->ps_shader, &ps_dirty);
 
+	if (rctx->chip_class < EVERGREEN && rctx->ps_shader && rctx->vs_shader) {
+		r600_adjust_gprs(rctx);
+	}
+
 	if (rctx->ps_shader && rctx->rasterizer &&
 	    ((rctx->rasterizer->sprite_coord_enable != rctx->ps_shader->current->sprite_coord_enable) ||
 	     (rctx->rasterizer->flatshade != rctx->ps_shader->current->flatshade))) {
@@ -1139,6 +1133,14 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 
 	r600_update_derived_state(rctx);
 
+	if (rctx->discard_draw) {
+		/* no need to clutter command stream with useless state
+		 * state change. Draw call are discarded if shader is
+		 * over the limit. See r600_adjust_gprs
+		 */
+		return;
+	}
+
 	if (info.indexed) {
 		/* Initialize the index buffer struct. */
 		pipe_resource_reference(&ib.buffer, rctx->index_buffer.buffer);
-- 
1.7.11.7