[Mesa-dev] [PATCH 11/19] radeonsi: don't re-create shader PM4 states after scratch buffer update

Sun Oct 2 21:09:26 UTC 2016

From: Marek Olšák <marek.olsak at amd.com>

---
 src/gallium/drivers/radeonsi/si_pm4.c           |  9 +++++++-
 src/gallium/drivers/radeonsi/si_pm4.h           |  1 +
 src/gallium/drivers/radeonsi/si_state_shaders.c | 30 +++++++++++++------------
 3 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_pm4.c b/src/gallium/drivers/radeonsi/si_pm4.c
index c3032fc..386d093 100644
--- a/src/gallium/drivers/radeonsi/si_pm4.c
+++ b/src/gallium/drivers/radeonsi/si_pm4.c
@@ -96,25 +96,32 @@ void si_pm4_add_bo(struct si_pm4_state *state,
 		   enum radeon_bo_priority priority)
 {
 	unsigned idx = state->nbo++;
 	assert(idx < SI_PM4_MAX_BO);
 
 	r600_resource_reference(&state->bo[idx], bo);
 	state->bo_usage[idx] = usage;
 	state->bo_priority[idx] = priority;
 }
 
-void si_pm4_free_state_simple(struct si_pm4_state *state)
+void si_pm4_clear_state(struct si_pm4_state *state) /* Drop the state's buffer references and reset its counters; the struct itself is not freed. */
 {
 	for (int i = 0; i < state->nbo; ++i)
 		r600_resource_reference(&state->bo[i], NULL);
 	r600_resource_reference(&state->indirect_buffer, NULL);
+	state->nbo = 0; /* every bo[] slot below the old count was unreferenced above */
+	state->ndw = 0; /* NOTE(review): presumably the recorded dword count; confirm no other fields of the struct need resetting before reuse */
+}
+
+void si_pm4_free_state_simple(struct si_pm4_state *state)
+{
+	si_pm4_clear_state(state); /* release the buffer references before freeing the struct itself */
 	FREE(state);
 }
 
 void si_pm4_free_state(struct si_context *sctx,
 		       struct si_pm4_state *state,
 		       unsigned idx)
 {
 	if (!state)
 		return;
 
diff --git a/src/gallium/drivers/radeonsi/si_pm4.h b/src/gallium/drivers/radeonsi/si_pm4.h
index 35fa6c3..9b02a80 100644
--- a/src/gallium/drivers/radeonsi/si_pm4.h
+++ b/src/gallium/drivers/radeonsi/si_pm4.h
@@ -64,20 +64,21 @@ void si_pm4_cmd_add(struct si_pm4_state *state, uint32_t dw);
 void si_pm4_cmd_end(struct si_pm4_state *state, bool predicate);
 
 void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val);
 void si_pm4_add_bo(struct si_pm4_state *state,
 		   struct r600_resource *bo,
 		   enum radeon_bo_usage usage,
 		   enum radeon_bo_priority priority);
 void si_pm4_upload_indirect_buffer(struct si_context *sctx,
 				   struct si_pm4_state *state);
 
+void si_pm4_clear_state(struct si_pm4_state *state);
 void si_pm4_free_state_simple(struct si_pm4_state *state);
 void si_pm4_free_state(struct si_context *sctx,
 		       struct si_pm4_state *state,
 		       unsigned idx);
 
 void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state);
 void si_pm4_emit_dirty(struct si_context *sctx);
 void si_pm4_reset_emitted(struct si_context *sctx);
 
 #endif
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 816aadc..acbceba 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -322,27 +322,37 @@ static void si_set_tesseval_regs(struct si_screen *sscreen,
 	} else
 		distribution_mode = V_028B6C_DISTRIBUTION_MODE_NO_DIST;
 
 	si_pm4_set_reg(pm4, R_028B6C_VGT_TF_PARAM,
 		       S_028B6C_TYPE(type) |
 		       S_028B6C_PARTITIONING(partitioning) |
 		       S_028B6C_TOPOLOGY(topology) |
 		       S_028B6C_DISTRIBUTION_MODE(distribution_mode));
 }
 
+static struct si_pm4_state *si_get_shader_pm4_state(struct si_shader *shader)
+{	/* Return shader->pm4 ready to be filled in: cleared if it already exists, newly allocated otherwise. */
+	if (shader->pm4)
+		si_pm4_clear_state(shader->pm4); /* reuse the existing struct instead of freeing and re-creating it */
+	else
+		shader->pm4 = CALLOC_STRUCT(si_pm4_state);
+	/* The result may be NULL (presumably when CALLOC_STRUCT fails); the si_shader_* callers check for that. */
+	return shader->pm4;
+}
+
 static void si_shader_ls(struct si_shader *shader)
 {
 	struct si_pm4_state *pm4;
 	unsigned vgpr_comp_cnt;
 	uint64_t va;
 
-	pm4 = shader->pm4 = CALLOC_STRUCT(si_pm4_state);
+	pm4 = si_get_shader_pm4_state(shader);
 	if (!pm4)
 		return;
 
 	va = shader->bo->gpu_address;
 	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
 
 	/* We need at least 2 components for LS.
 	 * VGPR0-3: (VertexID, RelAutoindex, ???, InstanceID). */
 	vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : 1;
 
@@ -356,21 +366,21 @@ static void si_shader_ls(struct si_shader *shader)
 			   S_00B528_FLOAT_MODE(shader->config.float_mode);
 	shader->config.rsrc2 = S_00B52C_USER_SGPR(SI_LS_NUM_USER_SGPR) |
 			   S_00B52C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
 }
 
 static void si_shader_hs(struct si_shader *shader)
 {
 	struct si_pm4_state *pm4;
 	uint64_t va;
 
-	pm4 = shader->pm4 = CALLOC_STRUCT(si_pm4_state);
+	pm4 = si_get_shader_pm4_state(shader);
 	if (!pm4)
 		return;
 
 	va = shader->bo->gpu_address;
 	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
 
 	si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8);
 	si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, va >> 40);
 	si_pm4_set_reg(pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS,
 		       S_00B428_VGPRS((shader->config.num_vgprs - 1) / 4) |
@@ -384,22 +394,21 @@ static void si_shader_hs(struct si_shader *shader)
 }
 
 static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader)
 {
 	struct si_pm4_state *pm4;
 	unsigned num_user_sgprs;
 	unsigned vgpr_comp_cnt;
 	uint64_t va;
 	unsigned oc_lds_en;
 
-	pm4 = shader->pm4 = CALLOC_STRUCT(si_pm4_state);
-
+	pm4 = si_get_shader_pm4_state(shader);
 	if (!pm4)
 		return;
 
 	va = shader->bo->gpu_address;
 	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
 
 	if (shader->selector->type == PIPE_SHADER_VERTEX) {
 		vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : 0;
 		num_user_sgprs = SI_ES_NUM_USER_SGPR;
 	} else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
@@ -460,22 +469,21 @@ static void si_shader_gs(struct si_shader *shader)
 	unsigned gs_vert_itemsize = shader->selector->gsvs_vertex_size;
 	unsigned gsvs_itemsize = shader->selector->max_gsvs_emit_size >> 2;
 	unsigned gs_num_invocations = shader->selector->gs_num_invocations;
 	struct si_pm4_state *pm4;
 	uint64_t va;
 	unsigned max_stream = shader->selector->max_gs_stream;
 
 	/* The GSVS_RING_ITEMSIZE register takes 15 bits */
 	assert(gsvs_itemsize < (1 << 15));
 
-	pm4 = shader->pm4 = CALLOC_STRUCT(si_pm4_state);
-
+	pm4 = si_get_shader_pm4_state(shader);
 	if (!pm4)
 		return;
 
 	si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE, si_vgt_gs_mode(shader));
 
 	si_pm4_set_reg(pm4, R_028A60_VGT_GSVS_RING_OFFSET_1, gsvs_itemsize);
 	si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize * ((max_stream >= 2) ? 2 : 1));
 	si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize * ((max_stream >= 3) ? 3 : 1));
 
 	si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize * (max_stream + 1));
@@ -518,22 +526,21 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
 {
 	struct si_pm4_state *pm4;
 	unsigned num_user_sgprs;
 	unsigned nparams, vgpr_comp_cnt;
 	uint64_t va;
 	unsigned oc_lds_en;
 	unsigned window_space =
 	   shader->selector->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
 	bool enable_prim_id = si_vs_exports_prim_id(shader);
 
-	pm4 = shader->pm4 = CALLOC_STRUCT(si_pm4_state);
-
+	pm4 = si_get_shader_pm4_state(shader);
 	if (!pm4)
 		return;
 
 	/* We always write VGT_GS_MODE in the VS state, because every switch
 	 * between different shader pipelines involving a different GS or no
 	 * GS at all involves a switch of the VS (different GS use different
 	 * copy shaders). On the other hand, when the API switches from a GS to
 	 * no GS and then back to the same GS used originally, the GS state is
 	 * not sent again.
 	 */
@@ -682,22 +689,21 @@ static void si_shader_ps(struct si_shader *shader)
 	/* we need to enable at least one of them, otherwise we hang the GPU */
 	assert(G_0286CC_PERSP_SAMPLE_ENA(input_ena) ||
 	       G_0286CC_PERSP_CENTER_ENA(input_ena) ||
 	       G_0286CC_PERSP_CENTROID_ENA(input_ena) ||
 	       G_0286CC_PERSP_PULL_MODEL_ENA(input_ena) ||
 	       G_0286CC_LINEAR_SAMPLE_ENA(input_ena) ||
 	       G_0286CC_LINEAR_CENTER_ENA(input_ena) ||
 	       G_0286CC_LINEAR_CENTROID_ENA(input_ena) ||
 	       G_0286CC_LINE_STIPPLE_TEX_ENA(input_ena));
 
-	pm4 = shader->pm4 = CALLOC_STRUCT(si_pm4_state);
-
+	pm4 = si_get_shader_pm4_state(shader);
 	if (!pm4)
 		return;
 
 	/* SPI_BARYC_CNTL.POS_FLOAT_LOCATION
 	 * Possible vaules:
 	 * 0 -> Position = pixel center
 	 * 1 -> Position = pixel centroid
 	 * 2 -> Position = at sample position
 	 *
 	 * From GLSL 4.5 specification, section 7.1:
@@ -784,24 +790,20 @@ static void si_shader_ps(struct si_shader *shader)
 	else if (info->num_memory_instructions >= 2 ||
 	         shader->binary.code_size > 100*4)
 		shader->z_order = V_02880C_EARLY_Z_THEN_RE_Z;
 	else
 		shader->z_order = V_02880C_EARLY_Z_THEN_LATE_Z;
 }
 
 static void si_shader_init_pm4_state(struct si_screen *sscreen,
                                      struct si_shader *shader)
 {
-
-	if (shader->pm4)
-		si_pm4_free_state_simple(shader->pm4);
-
 	switch (shader->selector->type) {
 	case PIPE_SHADER_VERTEX:
 		if (shader->key.vs.as_ls)
 			si_shader_ls(shader);
 		else if (shader->key.vs.as_es)
 			si_shader_es(sscreen, shader);
 		else
 			si_shader_vs(sscreen, shader, NULL);
 		break;
 	case PIPE_SHADER_TESS_CTRL:
-- 
2.7.4