[Mesa-dev] [PATCH] radeonsi: don't use CLEAR_STATE on SI

Nicolai Hähnle nhaehnle at gmail.com
Fri Aug 18 11:11:20 UTC 2017


On 15.08.2017 18:21, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
> 
> This fixes random hangs with Unigine Valley.
> 
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=102201

One comment below. With that addressed:

Fixes: 064550238ef0 ("radeonsi: use CLEAR_STATE to initialize some 
registers")
Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>


> ---
>   src/gallium/drivers/radeonsi/si_hw_context.c | 20 ++++++++-----
>   src/gallium/drivers/radeonsi/si_pipe.c       |  4 +++
>   src/gallium/drivers/radeonsi/si_pipe.h       |  1 +
>   src/gallium/drivers/radeonsi/si_state.c      | 45 ++++++++++++++++++++++++++--
>   4 files changed, 60 insertions(+), 10 deletions(-)
> 
> diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
> index 3582cd7..46e6073 100644
> --- a/src/gallium/drivers/radeonsi/si_hw_context.c
> +++ b/src/gallium/drivers/radeonsi/si_hw_context.c
> @@ -225,41 +225,47 @@ void si_begin_new_cs(struct si_context *ctx)
>   	if (ctx->queued.named.gs)
>   		ctx->prefetch_L2_mask |= SI_PREFETCH_GS;
>   	if (ctx->queued.named.vs)
>   		ctx->prefetch_L2_mask |= SI_PREFETCH_VS;
>   	if (ctx->queued.named.ps)
>   		ctx->prefetch_L2_mask |= SI_PREFETCH_PS;
>   	if (ctx->vertex_buffers.buffer)
>   		ctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
>   
>   	/* CLEAR_STATE disables all colorbuffers, so only enable bound ones. */
> -	ctx->framebuffer.dirty_cbufs =
> -		u_bit_consecutive(0, ctx->framebuffer.state.nr_cbufs);
> -	/* CLEAR_STATE disables the zbuffer, so only enable it if it's bound. */
> -	ctx->framebuffer.dirty_zsbuf = ctx->framebuffer.state.zsbuf != NULL;
> +	bool has_clear_state = ctx->screen->has_clear_state;
> +	if (has_clear_state) {
> +		ctx->framebuffer.dirty_cbufs =
> +			 u_bit_consecutive(0, ctx->framebuffer.state.nr_cbufs);
> +		/* CLEAR_STATE disables the zbuffer, so only enable it if it's bound. */
> +		ctx->framebuffer.dirty_zsbuf = ctx->framebuffer.state.zsbuf != NULL;
> +	} else {
> +		ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, 8);
> +		ctx->framebuffer.dirty_zsbuf = true;
> +	}
>   	/* This should always be marked as dirty to set the framebuffer scissor
>   	 * at least. */
>   	si_mark_atom_dirty(ctx, &ctx->framebuffer.atom);
>   
>   	si_mark_atom_dirty(ctx, &ctx->clip_regs);
>   	/* CLEAR_STATE sets zeros. */
> -	if (ctx->clip_state.any_nonzeros)
> +	if (!has_clear_state || ctx->clip_state.any_nonzeros)
>   		si_mark_atom_dirty(ctx, &ctx->clip_state.atom);
>   	ctx->msaa_sample_locs.nr_samples = 0;
>   	si_mark_atom_dirty(ctx, &ctx->msaa_sample_locs.atom);
>   	si_mark_atom_dirty(ctx, &ctx->msaa_config);
>   	/* CLEAR_STATE sets 0xffff. */
> -	if (ctx->sample_mask.sample_mask != 0xffff)
> +	if (!has_clear_state || ctx->sample_mask.sample_mask != 0xffff)
>   		si_mark_atom_dirty(ctx, &ctx->sample_mask.atom);
>   	si_mark_atom_dirty(ctx, &ctx->cb_render_state);
>   	/* CLEAR_STATE sets zeros. */
> -	if (ctx->blend_color.any_nonzeros)
> +	if (!has_clear_state || ctx->blend_color.any_nonzeros)
>   		si_mark_atom_dirty(ctx, &ctx->blend_color.atom);
>   	si_mark_atom_dirty(ctx, &ctx->db_render_state);
>   	si_mark_atom_dirty(ctx, &ctx->stencil_ref.atom);
>   	si_mark_atom_dirty(ctx, &ctx->spi_map);
>   	si_mark_atom_dirty(ctx, &ctx->b.streamout.enable_atom);
>   	si_mark_atom_dirty(ctx, &ctx->b.render_cond_atom);
>   	si_all_descriptors_begin_new_cs(ctx);
>   	si_all_resident_buffers_begin_new_cs(ctx);
>   
>   	ctx->b.scissors.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
> index cac1d01..80a77a8 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.c
> +++ b/src/gallium/drivers/radeonsi/si_pipe.c
> @@ -1060,20 +1060,24 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws,
>   
>   	if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false))
>   		si_init_perfcounters(sscreen);
>   
>   	/* Hawaii has a bug with offchip buffers > 256 that can be worked
>   	 * around by setting 4K granularity.
>   	 */
>   	sscreen->tess_offchip_block_dw_size =
>   		sscreen->b.family == CHIP_HAWAII ? 4096 : 8192;
>   
> +	/* The mere presense of CLEAR_STATE in the IB causes random GPU hangs
> +	 * on SI. */
> +	sscreen->has_clear_state = sscreen->b.chip_class >= CIK;
> +
>   	sscreen->has_distributed_tess =
>   		sscreen->b.chip_class >= VI &&
>   		sscreen->b.info.max_se >= 2;
>   
>   	sscreen->has_draw_indirect_multi =
>   		(sscreen->b.family >= CHIP_POLARIS10) ||
>   		(sscreen->b.chip_class == VI &&
>   		 sscreen->b.info.pfp_fw_version >= 121 &&
>   		 sscreen->b.info.me_fw_version >= 87) ||
>   		(sscreen->b.chip_class == CIK &&
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
> index ee0ab1b..8d82287 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -80,20 +80,21 @@
>   #define SIX_BITS		0x3F
>   
>   struct si_compute;
>   struct hash_table;
>   struct u_suballocator;
>   
>   struct si_screen {
>   	struct r600_common_screen	b;
>   	unsigned			gs_table_depth;
>   	unsigned			tess_offchip_block_dw_size;
> +	bool				has_clear_state;
>   	bool				has_distributed_tess;
>   	bool				has_draw_indirect_multi;
>   	bool				has_ds_bpermute;
>   	bool				has_msaa_sample_loc_bug;
>   	bool				llvm_has_working_vgpr_indexing;
>   
>   	/* Whether shaders are monolithic (1-part) or separate (3-part). */
>   	bool				use_monolithic_shaders;
>   	bool				record_llvm_ir;
>   
> diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
> index 2c413a4..cd18d72 100644
> --- a/src/gallium/drivers/radeonsi/si_state.c
> +++ b/src/gallium/drivers/radeonsi/si_state.c
> @@ -4417,50 +4417,66 @@ si_write_harvested_raster_configs(struct si_context *sctx,
>   	}
>   }
>   
>   static void si_init_config(struct si_context *sctx)
>   {
>   	struct si_screen *sscreen = sctx->screen;
>   	unsigned num_rb = MIN2(sctx->screen->b.info.num_render_backends, 16);
>   	unsigned rb_mask = sctx->screen->b.info.enabled_rb_mask;
>   	unsigned raster_config, raster_config_1;
>   	uint64_t border_color_va = sctx->border_color_buffer->gpu_address;
> +	bool has_clear_state = sscreen->has_clear_state;
>   	struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);

Please add

    assert(has_clear_state || sscreen->b.chip_class == SI);

as living documentation that the has_clear_state settings are 
insufficient for later chip classes.

Cheers,
Nicolai


>   
>   	if (!pm4)
>   		return;
>   
>   	si_pm4_cmd_begin(pm4, PKT3_CONTEXT_CONTROL);
>   	si_pm4_cmd_add(pm4, CONTEXT_CONTROL_LOAD_ENABLE(1));
>   	si_pm4_cmd_add(pm4, CONTEXT_CONTROL_SHADOW_ENABLE(1));
>   	si_pm4_cmd_end(pm4, false);
>   
> -	si_pm4_cmd_begin(pm4, PKT3_CLEAR_STATE);
> -	si_pm4_cmd_add(pm4, 0);
> -	si_pm4_cmd_end(pm4, false);
> +	if (has_clear_state) {
> +		si_pm4_cmd_begin(pm4, PKT3_CLEAR_STATE);
> +		si_pm4_cmd_add(pm4, 0);
> +		si_pm4_cmd_end(pm4, false);
> +	}
>   
>   	si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64));
> +	if (!has_clear_state)
> +		si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0));
>   
>   	/* FIXME calculate these values somehow ??? */
>   	if (sctx->b.chip_class <= VI) {
>   		si_pm4_set_reg(pm4, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES);
>   		si_pm4_set_reg(pm4, R_028A58_VGT_ES_PER_GS, 0x40);
>   	}
>   
> +	if (!has_clear_state) {
> +		si_pm4_set_reg(pm4, R_028A5C_VGT_GS_PER_VS, 0x2);
> +		si_pm4_set_reg(pm4, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0);
> +		si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0);
> +	}
> +
>   	si_pm4_set_reg(pm4, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1);
> +	if (!has_clear_state)
> +		si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0);
>   	if (sctx->b.chip_class < CIK)
>   		si_pm4_set_reg(pm4, R_008A14_PA_CL_ENHANCE, S_008A14_NUM_CLIP_SEQ(3) |
>   			       S_008A14_CLIP_VTX_REORDER_ENA(1));
>   
>   	si_pm4_set_reg(pm4, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 0x76543210);
>   	si_pm4_set_reg(pm4, R_028BD8_PA_SC_CENTROID_PRIORITY_1, 0xfedcba98);
>   
> +	if (!has_clear_state)
> +		si_pm4_set_reg(pm4, R_02882C_PA_SU_PRIM_FILTER_CNTL, 0);
> +
>   	switch (sctx->screen->b.family) {
>   	case CHIP_TAHITI:
>   	case CHIP_PITCAIRN:
>   		raster_config = 0x2a00126a;
>   		raster_config_1 = 0x00000000;
>   		break;
>   	case CHIP_VERDE:
>   		raster_config = 0x0000124a;
>   		raster_config_1 = 0x00000000;
>   		break;
> @@ -4557,20 +4573,40 @@ static void si_init_config(struct si_context *sctx)
>   		si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0);
>   		si_pm4_set_reg(pm4, R_028204_PA_SC_WINDOW_SCISSOR_TL, S_028204_WINDOW_OFFSET_DISABLE(1));
>   		si_pm4_set_reg(pm4, R_028240_PA_SC_GENERIC_SCISSOR_TL, S_028240_WINDOW_OFFSET_DISABLE(1));
>   		si_pm4_set_reg(pm4, R_028244_PA_SC_GENERIC_SCISSOR_BR,
>   			       S_028244_BR_X(16384) | S_028244_BR_Y(16384));
>   		si_pm4_set_reg(pm4, R_028030_PA_SC_SCREEN_SCISSOR_TL, 0);
>   		si_pm4_set_reg(pm4, R_028034_PA_SC_SCREEN_SCISSOR_BR,
>   			       S_028034_BR_X(16384) | S_028034_BR_Y(16384));
>   	}
>   
> +	if (!has_clear_state) {
> +		si_pm4_set_reg(pm4, R_02820C_PA_SC_CLIPRECT_RULE, 0xFFFF);
> +		si_pm4_set_reg(pm4, R_028230_PA_SC_EDGERULE,
> +			       S_028230_ER_TRI(0xA) |
> +			       S_028230_ER_POINT(0xA) |
> +			       S_028230_ER_RECT(0xA) |
> +			       /* Required by DX10_DIAMOND_TEST_ENA: */
> +			       S_028230_ER_LINE_LR(0x1A) |
> +			       S_028230_ER_LINE_RL(0x26) |
> +			       S_028230_ER_LINE_TB(0xA) |
> +			       S_028230_ER_LINE_BT(0xA));
> +		/* PA_SU_HARDWARE_SCREEN_OFFSET must be 0 due to hw bug on SI */
> +		si_pm4_set_reg(pm4, R_028234_PA_SU_HARDWARE_SCREEN_OFFSET, 0);
> +		si_pm4_set_reg(pm4, R_028820_PA_CL_NANINF_CNTL, 0);
> +		si_pm4_set_reg(pm4, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0);
> +		si_pm4_set_reg(pm4, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0);
> +		si_pm4_set_reg(pm4, R_028AC8_DB_PRELOAD_CONTROL, 0x0);
> +		si_pm4_set_reg(pm4, R_02800C_DB_RENDER_OVERRIDE, 0);
> +	}
> +
>   	if (sctx->b.chip_class >= GFX9) {
>   		si_pm4_set_reg(pm4, R_030920_VGT_MAX_VTX_INDX, ~0);
>   		si_pm4_set_reg(pm4, R_030924_VGT_MIN_VTX_INDX, 0);
>   		si_pm4_set_reg(pm4, R_030928_VGT_INDX_OFFSET, 0);
>   	} else {
>   		/* These registers, when written, also overwrite the CLEAR_STATE
>   		 * context, so we can't rely on CLEAR_STATE setting them.
>   		 * It would be an issue if there was another UMD changing them.
>   		 */
>   		si_pm4_set_reg(pm4, R_028400_VGT_MAX_VTX_INDX, ~0);
> @@ -4633,20 +4669,23 @@ static void si_init_config(struct si_context *sctx)
>   			S_028B50_DONUT_SPLIT(16);
>   
>   		/* Testing with Unigine Heaven extreme tesselation yielded best results
>   		 * with TRAP_SPLIT = 3.
>   		 */
>   		if (sctx->b.family == CHIP_FIJI ||
>   		    sctx->b.family >= CHIP_POLARIS10)
>   			vgt_tess_distribution |= S_028B50_TRAP_SPLIT(3);
>   
>   		si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION, vgt_tess_distribution);
> +	} else if (!has_clear_state) {
> +		si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
> +		si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 16);
>   	}
>   
>   	si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8);
>   	if (sctx->b.chip_class >= CIK)
>   		si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, border_color_va >> 40);
>   	si_pm4_add_bo(pm4, sctx->border_color_buffer, RADEON_USAGE_READ,
>   		      RADEON_PRIO_BORDER_COLORS);
>   
>   	if (sctx->b.chip_class >= GFX9) {
>   		unsigned num_se = sscreen->b.info.max_se;
> 


-- 
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.


More information about the mesa-dev mailing list