[Mesa-dev] [PATCH 2/2] radeonsi: disable primitive restart for non-strip prims based on app list

Nicolai Hähnle nhaehnle at gmail.com
Mon Apr 24 15:34:28 UTC 2017


On 24.04.2017 15:22, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>

I don't like it. This kind of app-specific override is what drirc was 
meant to provide. Having separate places for it is confusing.

Cheers,
Nicolai

>
> ---
>  src/gallium/drivers/radeonsi/si_pipe.c       | 20 +++++++++++++
>  src/gallium/drivers/radeonsi/si_pipe.h       |  1 +
>  src/gallium/drivers/radeonsi/si_state_draw.c | 45 ++++++++++++++++++++--------
>  3 files changed, 54 insertions(+), 12 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
> index 1a83564..53a8201 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.c
> +++ b/src/gallium/drivers/radeonsi/si_pipe.c
> @@ -29,20 +29,29 @@
>  #include "radeon/radeon_uvd.h"
>  #include "util/u_memory.h"
>  #include "util/u_suballoc.h"
>  #include "util/u_tests.h"
>  #include "vl/vl_decoder.h"
>  #include "../ddebug/dd_util.h"
>
>  #define SI_LLVM_DEFAULT_FEATURES \
>  	"+DumpCode,+vgpr-spilling,-fp32-denormals,-xnack"
>
> +/* DX10/11 apply primitive restart to strip primitive types only. */
> +static const char *apps_with_prim_restart_dx_behavior[] = {
> +	"DeusExMD",
> +	"DirtRally",
> +	"HitmanPro",
> +	"MadMax",
> +	"TotalWarhammer",
> +};
> +
>  /*
>   * pipe_context
>   */
>  static void si_destroy_context(struct pipe_context *context)
>  {
>  	struct si_context *sctx = (struct si_context *)context;
>  	int i;
>
>  	/* Unreference the framebuffer normally to disable related logic
>  	 * properly.
> @@ -306,20 +315,31 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
>  	 *
>  	 * The recommended value is 4 per CU at most. Higher numbers don't
>  	 * bring much benefit, but they still occupy chip resources (think
>  	 * async compute). I've seen ~2% performance difference between 4 and 32.
>  	 */
>  	sctx->scratch_waves = MAX2(32 * sscreen->b.info.num_good_compute_units,
>  				   max_threads_per_block / 64);
>
>  	sctx->tm = si_create_llvm_target_machine(sscreen);
>
> +	/* Process the app list. */
> +	char process_name[128];
> +	if (os_get_process_name(process_name, sizeof(process_name))) {
> +		for (i = 0; i < ARRAY_SIZE(apps_with_prim_restart_dx_behavior); i++) {
> +			if (strcmp(process_name, apps_with_prim_restart_dx_behavior[i]) == 0) {
> +				sctx->use_prim_restart_dx_behavior = true;
> +				break;
> +			}
> +		}
> +	}
> +
>  	return &sctx->b.b;
>  fail:
>  	fprintf(stderr, "radeonsi: Failed to create a context.\n");
>  	si_destroy_context(&sctx->b.b);
>  	return NULL;
>  }
>
>  /*
>   * pipe_screen
>   */
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
> index ea61e1e..1edcfbc 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -236,20 +236,21 @@ struct si_context {
>
>  	struct radeon_winsys_cs		*ce_ib;
>  	struct radeon_winsys_cs		*ce_preamble_ib;
>  	bool				ce_need_synchronization;
>  	struct u_suballocator		*ce_suballocator;
>
>  	struct si_shader_ctx_state	fixed_func_tcs_shader;
>  	LLVMTargetMachineRef		tm; /* only non-threaded compilation */
>  	bool				gfx_flush_in_progress;
>  	bool				compute_is_busy;
> +	bool				use_prim_restart_dx_behavior;
>
>  	/* Atoms (direct states). */
>  	union si_state_atoms		atoms;
>  	unsigned			dirty_atoms; /* mask */
>  	/* PM4 states (precomputed immutable states) */
>  	unsigned			dirty_states;
>  	union si_state			queued;
>  	union si_state			emitted;
>
>  	/* Atom declarations. */
> diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
> index e6a9ee0..319160e 100644
> --- a/src/gallium/drivers/radeonsi/si_state_draw.c
> +++ b/src/gallium/drivers/radeonsi/si_state_draw.c
> @@ -445,42 +445,43 @@ void si_init_ia_multi_vgt_param_table(struct si_context *sctx)
>  		key.u.tcs_tes_uses_prim_id = tess_uses_primid;
>  		key.u.uses_gs = uses_gs;
>
>  		sctx->ia_multi_vgt_param[key.index] =
>  			si_get_init_multi_vgt_param(sctx->screen, &key);
>  	}
>  }
>
>  static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
>  					  const struct pipe_draw_info *info,
> -					  unsigned num_patches)
> +					  unsigned num_patches,
> +					  bool primitive_restart)
>  {
>  	union si_vgt_param_key key = sctx->ia_multi_vgt_param_key;
>  	unsigned primgroup_size;
>  	unsigned ia_multi_vgt_param;
>
>  	if (sctx->tes_shader.cso) {
>  		primgroup_size = num_patches; /* must be a multiple of NUM_PATCHES */
>  	} else if (sctx->gs_shader.cso) {
>  		primgroup_size = 64; /* recommended with a GS */
>  	} else {
>  		primgroup_size = 128; /* recommended without a GS and tess */
>  	}
>
>  	key.u.prim = info->mode;
>  	key.u.uses_instancing = info->indirect || info->instance_count > 1;
>  	key.u.multi_instances_smaller_than_primgroup =
>  		info->indirect ||
>  		(info->instance_count > 1 &&
>  		 (info->count_from_stream_output ||
>  		  si_num_prims_for_vertices(info) < primgroup_size));
> -	key.u.primitive_restart = info->primitive_restart;
> +	key.u.primitive_restart = primitive_restart;
>  	key.u.count_from_stream_output = info->count_from_stream_output != NULL;
>
>  	ia_multi_vgt_param = sctx->ia_multi_vgt_param[key.index] |
>  			     S_028AA8_PRIMGROUP_SIZE(primgroup_size - 1);
>
>  	if (sctx->gs_shader.cso) {
>  		/* GS requirement. */
>  		if (sctx->b.chip_class <= VI &&
>  		    SI_GS_PER_ES / primgroup_size >= sctx->screen->gs_table_depth - 3)
>  			ia_multi_vgt_param |= S_028AA8_PARTIAL_ES_WAVE_ON(1);
> @@ -544,28 +545,30 @@ static void si_emit_vs_state(struct si_context *sctx,
>  			sctx->shader_userdata.sh_base[PIPE_SHADER_VERTEX] +
>  			SI_SGPR_VS_STATE_BITS * 4,
>  			sctx->current_vs_state);
>
>  		sctx->last_vs_state = sctx->current_vs_state;
>  	}
>  }
>
>  static void si_emit_draw_registers(struct si_context *sctx,
>  				   const struct pipe_draw_info *info,
> -				   unsigned num_patches)
> +				   unsigned num_patches,
> +				   bool primitive_restart)
>  {
>  	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
>  	unsigned prim = si_conv_pipe_prim(info->mode);
>  	unsigned gs_out_prim = si_conv_prim_to_gs_out(sctx->current_rast_prim);
>  	unsigned ia_multi_vgt_param;
>
> -	ia_multi_vgt_param = si_get_ia_multi_vgt_param(sctx, info, num_patches);
> +	ia_multi_vgt_param = si_get_ia_multi_vgt_param(sctx, info, num_patches,
> +						       primitive_restart);
>
>  	/* Draw state. */
>  	if (ia_multi_vgt_param != sctx->last_multi_vgt_param) {
>  		if (sctx->b.chip_class >= GFX9)
>  			radeon_set_uconfig_reg_idx(cs, R_030960_IA_MULTI_VGT_PARAM, 4, ia_multi_vgt_param);
>  		else if (sctx->b.chip_class >= CIK)
>  			radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param);
>  		else
>  			radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);
>
> @@ -579,32 +582,32 @@ static void si_emit_draw_registers(struct si_context *sctx,
>
>  		sctx->last_prim = prim;
>  	}
>
>  	if (gs_out_prim != sctx->last_gs_out_prim) {
>  		radeon_set_context_reg(cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out_prim);
>  		sctx->last_gs_out_prim = gs_out_prim;
>  	}
>
>  	/* Primitive restart. */
> -	if (info->primitive_restart != sctx->last_primitive_restart_en) {
> +	if (primitive_restart != sctx->last_primitive_restart_en) {
>  		if (sctx->b.chip_class >= GFX9)
>  			radeon_set_uconfig_reg(cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN,
> -					       info->primitive_restart);
> +					       primitive_restart);
>  		else
>  			radeon_set_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
> -					       info->primitive_restart);
> +					       primitive_restart);
>
> -		sctx->last_primitive_restart_en = info->primitive_restart;
> +		sctx->last_primitive_restart_en = primitive_restart;
>
>  	}
> -	if (info->primitive_restart &&
> +	if (primitive_restart &&
>  	    (info->restart_index != sctx->last_restart_index ||
>  	     sctx->last_restart_index == SI_RESTART_INDEX_UNKNOWN)) {
>  		radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX,
>  				       info->restart_index);
>  		sctx->last_restart_index = info->restart_index;
>  	}
>  }
>
>  static void si_emit_draw_packets(struct si_context *sctx,
>  				 const struct pipe_draw_info *info,
> @@ -1129,29 +1132,47 @@ void si_ce_pre_draw_synchronization(struct si_context *sctx)
>  void si_ce_post_draw_synchronization(struct si_context *sctx)
>  {
>  	if (sctx->ce_need_synchronization) {
>  		radeon_emit(sctx->b.gfx.cs, PKT3(PKT3_INCREMENT_DE_COUNTER, 0, 0));
>  		radeon_emit(sctx->b.gfx.cs, 0);
>
>  		sctx->ce_need_synchronization = false;
>  	}
>  }
>
> +static bool is_strip_primitive_mode(unsigned prim)
> +{
> +	return ((1 << prim) &
> +		((1 << PIPE_PRIM_LINE_STRIP) |
> +		 (1 << PIPE_PRIM_LINE_STRIP_ADJACENCY) |
> +		 (1 << PIPE_PRIM_QUAD_STRIP) |
> +		 (1 << PIPE_PRIM_TRIANGLE_STRIP) |
> +		 (1 << PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY))) != 0;
> +}
> +
>  void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
>  {
>  	struct si_context *sctx = (struct si_context *)ctx;
>  	struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
>  	const struct pipe_index_buffer *ib = &sctx->index_buffer;
>  	struct pipe_index_buffer ib_tmp; /* for index buffer uploads only */
>  	unsigned mask, dirty_tex_counter;
>  	enum pipe_prim_type rast_prim;
>  	unsigned num_patches = 0;
> +	bool primitive_restart = info->indexed && info->primitive_restart;
> +
> +	/* This is better for performance, but the difference might not be
> +	 * measurable.
> +	 */
> +	if (sctx->use_prim_restart_dx_behavior &&
> +	    !is_strip_primitive_mode(info->mode))
> +		primitive_restart = false;
>
>  	if (likely(!info->indirect)) {
>  		/* SI-CI treat instance_count==0 as instance_count==1. There is
>  		 * no workaround for indirect draws, but we can at least skip
>  		 * direct draws.
>  		 */
>  		if (unlikely(!info->instance_count))
>  			return;
>
>  		/* Handle count == 0. */
> @@ -1207,21 +1228,21 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
>  	if (sctx->gs_shader.cso) {
>  		/* Determine whether the GS triangle strip adjacency fix should
>  		 * be applied. Rotate every other triangle if
>  		 * - triangle strips with adjacency are fed to the GS and
>  		 * - primitive restart is disabled (the rotation doesn't help
>  		 *   when the restart occurs after an odd number of triangles).
>  		 */
>  		bool gs_tri_strip_adj_fix =
>  			!sctx->tes_shader.cso &&
>  			info->mode == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY &&
> -			!info->primitive_restart;
> +			!primitive_restart;
>
>  		if (gs_tri_strip_adj_fix != sctx->gs_tri_strip_adj_fix) {
>  			sctx->gs_tri_strip_adj_fix = gs_tri_strip_adj_fix;
>  			sctx->do_update_shaders = true;
>  		}
>  	}
>
>  	if (sctx->do_update_shaders && !si_update_shaders(sctx))
>  		return;
>
> @@ -1338,21 +1359,21 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
>
>  		si_pm4_emit(sctx, state);
>  		sctx->emitted.array[i] = state;
>  	}
>  	sctx->dirty_states = 0;
>
>  	si_emit_rasterizer_prim_state(sctx);
>  	if (sctx->tes_shader.cso)
>  		si_emit_derived_tess_state(sctx, info, &num_patches);
>  	si_emit_vs_state(sctx, info);
> -	si_emit_draw_registers(sctx, info, num_patches);
> +	si_emit_draw_registers(sctx, info, num_patches, primitive_restart);
>
>  	si_ce_pre_draw_synchronization(sctx);
>  	si_emit_draw_packets(sctx, info, ib);
>  	si_ce_post_draw_synchronization(sctx);
>
>  	if (sctx->trace_buf)
>  		si_trace_emit(sctx);
>
>  	/* Workaround for a VGT hang when streamout is enabled.
>  	 * It must be done after drawing. */
> @@ -1389,21 +1410,21 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
>  					rtex->dirty_level_mask |= 1 << surf->u.tex.level;
>  				if (rtex->dcc_gather_statistics)
>  					rtex->separate_dcc_dirty = true;
>  			} while (mask);
>  		}
>  		sctx->framebuffer.do_update_surf_dirtiness = false;
>  	}
>
>  	pipe_resource_reference(&ib_tmp.buffer, NULL);
>  	sctx->b.num_draw_calls++;
> -	if (info->primitive_restart)
> +	if (primitive_restart)
>  		sctx->b.num_prim_restart_calls++;
>  	if (G_0286E8_WAVESIZE(sctx->spi_tmpring_size))
>  		sctx->b.num_spill_draw_calls++;
>  }
>
>  void si_trace_emit(struct si_context *sctx)
>  {
>  	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
>
>  	sctx->trace_id++;
>


-- 
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.


More information about the mesa-dev mailing list