[Mesa-dev] [PATCH 1/2] radeonsi: enable out-of-order rasterization when possible on VI and GFX9 dGPUs

Marek Olšák maraeo at gmail.com
Wed Sep 6 22:35:38 UTC 2017


From: Marek Olšák <marek.olsak at amd.com>

---
 src/gallium/drivers/radeonsi/si_pipe.c          |   2 +
 src/gallium/drivers/radeonsi/si_pipe.h          |   1 +
 src/gallium/drivers/radeonsi/si_state.c         | 143 +++++++++++++++++++++++-
 src/gallium/drivers/radeonsi/si_state.h         |  10 +-
 src/gallium/drivers/radeonsi/si_state_shaders.c |   5 +
 5 files changed, 156 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 640b57c..9642edd 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -1041,20 +1041,22 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws,
 		 sscreen->b.info.pfp_fw_version >= 121 &&
 		 sscreen->b.info.me_fw_version >= 87) ||
 		(sscreen->b.chip_class == CIK &&
 		 sscreen->b.info.pfp_fw_version >= 211 &&
 		 sscreen->b.info.me_fw_version >= 173) ||
 		(sscreen->b.chip_class == SI &&
 		 sscreen->b.info.pfp_fw_version >= 79 &&
 		 sscreen->b.info.me_fw_version >= 142);
 
 	sscreen->has_ds_bpermute = sscreen->b.chip_class >= VI;
+	sscreen->has_out_of_order_rast = sscreen->b.chip_class >= VI &&
+					 sscreen->b.info.max_se >= 2;
 	sscreen->has_msaa_sample_loc_bug = (sscreen->b.family >= CHIP_POLARIS10 &&
 					    sscreen->b.family <= CHIP_POLARIS12) ||
 					   sscreen->b.family == CHIP_VEGA10 ||
 					   sscreen->b.family == CHIP_RAVEN;
 	sscreen->dpbb_allowed = sscreen->b.chip_class >= GFX9 &&
 				!(sscreen->b.debug_flags & DBG_NO_DPBB);
 	sscreen->dfsm_allowed = sscreen->dpbb_allowed &&
 				!(sscreen->b.debug_flags & DBG_NO_DFSM);
 
 	/* While it would be nice not to have this flag, we are constrained
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 8db7028..b8073ce 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -88,20 +88,21 @@ struct hash_table;
 struct u_suballocator;
 
 struct si_screen {
 	struct r600_common_screen	b;
 	unsigned			gs_table_depth;
 	unsigned			tess_offchip_block_dw_size;
 	bool				has_clear_state;
 	bool				has_distributed_tess;
 	bool				has_draw_indirect_multi;
 	bool				has_ds_bpermute;
+	bool				has_out_of_order_rast;
 	bool				has_msaa_sample_loc_bug;
 	bool				dpbb_allowed;
 	bool				dfsm_allowed;
 	bool				llvm_has_working_vgpr_indexing;
 
 	/* Whether shaders are monolithic (1-part) or separate (3-part). */
 	bool				use_monolithic_shaders;
 	bool				record_llvm_ir;
 
 	mtx_t			shader_parts_mutex;
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 7e9140b..855ad27 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -416,20 +416,21 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx,
 	struct si_pm4_state *pm4 = &blend->pm4;
 	uint32_t sx_mrt_blend_opt[8] = {0};
 	uint32_t color_control = 0;
 
 	if (!blend)
 		return NULL;
 
 	blend->alpha_to_coverage = state->alpha_to_coverage;
 	blend->alpha_to_one = state->alpha_to_one;
 	blend->dual_src_blend = util_blend_state_is_dual(state, 0);
+	blend->logicop_enable = state->logicop_enable;
 
 	if (state->logicop_enable) {
 		color_control |= S_028808_ROP3(state->logicop_func | (state->logicop_func << 4));
 	} else {
 		color_control |= S_028808_ROP3(0xcc);
 	}
 
 	si_pm4_set_reg(pm4, R_028B70_DB_ALPHA_TO_MASK,
 		       S_028B70_ALPHA_TO_MASK_ENABLE(state->alpha_to_coverage) |
 		       S_028B70_ALPHA_TO_MASK_OFFSET0(2) |
@@ -623,20 +624,27 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state)
 	    old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
 	    old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit)
 		sctx->do_update_shaders = true;
 
 	if (sctx->screen->dpbb_allowed &&
 	    (!old_blend ||
 	     old_blend->alpha_to_coverage != blend->alpha_to_coverage ||
 	     old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
 	     old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit))
 		si_mark_atom_dirty(sctx, &sctx->dpbb_state);
+
+	if (sctx->screen->has_out_of_order_rast &&
+	    (!old_blend ||
+	     (old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
+	      old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit ||
+	      old_blend->logicop_enable != blend->logicop_enable)))
+		si_mark_atom_dirty(sctx, &sctx->msaa_config);
 }
 
 static void si_delete_blend_state(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	si_pm4_delete_state(sctx, blend, (struct si_state_blend *)state);
 }
 
 static void si_set_blend_color(struct pipe_context *ctx,
 			       const struct pipe_blend_color *state)
@@ -1118,20 +1126,47 @@ static void *si_create_dsa_state(struct pipe_context *ctx,
 
 	dsa->depth_enabled = state->depth.enabled;
 	dsa->depth_write_enabled = state->depth.enabled &&
 				   state->depth.writemask;
 	dsa->stencil_enabled = state->stencil[0].enabled;
 	dsa->stencil_write_enabled = state->stencil[0].enabled &&
 				     (si_dsa_writes_stencil(&state->stencil[0]) ||
 				      si_dsa_writes_stencil(&state->stencil[1]));
 	dsa->db_can_write = dsa->depth_write_enabled ||
 			    dsa->stencil_write_enabled;
+	dsa->stencil_can_run_out_of_order = !dsa->stencil_enabled ||
+					    !dsa->db_can_write;
+
+	bool depth_forces_ordering =
+		dsa->depth_enabled &&
+		(state->depth.func == PIPE_FUNC_NEVER ||
+		 state->depth.func == PIPE_FUNC_LESS ||
+		 state->depth.func == PIPE_FUNC_LEQUAL ||
+		 state->depth.func == PIPE_FUNC_GREATER ||
+		 state->depth.func == PIPE_FUNC_GEQUAL);
+
+	/* Option to allow inexact behavior with out-of-order rasterization
+	 * when two overlapping pixels have the same Z, and depth and color
+	 * writes are enabled, and the depth function is LESS/LEQUAL/GREATER/
+	 * GEQUAL.
+	 */
+	const bool aggressive_opt_out_of_order = false;
+
+	/* Stencil isn't checked here and must be AND'ed with this. */
+	dsa->depth_forces_ordering_color_off =
+		!dsa->depth_enabled ||	     /* no depth tests and writes */
+		!dsa->depth_write_enabled || /* any depth function is OK */
+		depth_forces_ordering;
+	dsa->depth_forces_ordering_color_on =
+		depth_forces_ordering &&
+	        (aggressive_opt_out_of_order || !dsa->depth_write_enabled);
+
 	return dsa;
 }
 
 static void si_bind_dsa_state(struct pipe_context *ctx, void *state)
 {
         struct si_context *sctx = (struct si_context *)ctx;
 	struct si_state_dsa *old_dsa = sctx->queued.named.dsa;
         struct si_state_dsa *dsa = state;
 
         if (!state)
@@ -1147,20 +1182,29 @@ static void si_bind_dsa_state(struct pipe_context *ctx, void *state)
 
 	if (!old_dsa || old_dsa->alpha_func != dsa->alpha_func)
 		sctx->do_update_shaders = true;
 
 	if (sctx->screen->dpbb_allowed &&
 	    (!old_dsa ||
 	     (old_dsa->depth_enabled != dsa->depth_enabled ||
 	      old_dsa->stencil_enabled != dsa->stencil_enabled ||
 	      old_dsa->db_can_write != dsa->db_can_write)))
 		si_mark_atom_dirty(sctx, &sctx->dpbb_state);
+
+	if (sctx->screen->has_out_of_order_rast &&
+	    (!old_dsa ||
+	     (old_dsa->depth_enabled != dsa->depth_enabled ||
+	      old_dsa->stencil_enabled != dsa->stencil_enabled ||
+	      old_dsa->stencil_can_run_out_of_order != dsa->stencil_can_run_out_of_order ||
+	      old_dsa->depth_forces_ordering_color_off != dsa->depth_forces_ordering_color_off ||
+	      old_dsa->depth_forces_ordering_color_on != dsa->depth_forces_ordering_color_on)))
+		si_mark_atom_dirty(sctx, &sctx->msaa_config);
 }
 
 static void si_delete_dsa_state(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	si_pm4_delete_state(sctx, dsa, (struct si_state_dsa *)state);
 }
 
 static void *si_create_db_flush_dsa(struct si_context *sctx)
 {
@@ -2532,20 +2576,26 @@ static void si_dec_framebuffer_counters(const struct pipe_framebuffer_state *sta
 
 static void si_set_framebuffer_state(struct pipe_context *ctx,
 				     const struct pipe_framebuffer_state *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct pipe_constant_buffer constbuf = {0};
 	struct r600_surface *surf = NULL;
 	struct r600_texture *rtex;
 	bool old_any_dst_linear = sctx->framebuffer.any_dst_linear;
 	unsigned old_nr_samples = sctx->framebuffer.nr_samples;
+	unsigned old_colorbuf_enabled_4bit = sctx->framebuffer.colorbuf_enabled_4bit;
+	bool old_has_zsbuf = !!sctx->framebuffer.state.zsbuf;
+	/* surface.flags lives in the r600_texture behind zsbuf->texture. */
+	bool old_has_stencil = old_has_zsbuf &&
+		((struct r600_texture*)sctx->framebuffer.state.zsbuf->texture)->surface.flags &
+		RADEON_SURF_SBUFFER;
 	bool unbound = false;
 	int i;
 
 	si_update_fb_dirtiness_after_rendering(sctx);
 
 	for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
 		if (!sctx->framebuffer.state.cbufs[i])
 			continue;
 
 		rtex = (struct r600_texture*)sctx->framebuffer.state.cbufs[i]->texture;
@@ -2689,44 +2739,52 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 
 		p_atomic_inc(&rtex->framebuffers_bound);
 
 		if (rtex->dcc_gather_statistics) {
 			/* Dirty tracking must be enabled for DCC usage analysis. */
 			sctx->framebuffer.compressed_cb_mask |= 1 << i;
 			vi_separate_dcc_start_query(ctx, rtex);
 		}
 	}
 
+	struct r600_texture *zstex = NULL;
+
 	if (state->zsbuf) {
 		surf = (struct r600_surface*)state->zsbuf;
-		rtex = (struct r600_texture*)surf->base.texture;
+		zstex = (struct r600_texture*)surf->base.texture;
 
 		if (!surf->depth_initialized) {
 			si_init_depth_surface(sctx, surf);
 		}
 
-		if (vi_tc_compat_htile_enabled(rtex, surf->base.u.tex.level))
+		if (vi_tc_compat_htile_enabled(zstex, surf->base.u.tex.level))
 			sctx->framebuffer.DB_has_shader_readable_metadata = true;
 
 		r600_context_add_resource_size(ctx, surf->base.texture);
 	}
 
 	si_update_poly_offset_state(sctx);
 	si_mark_atom_dirty(sctx, &sctx->cb_render_state);
 	si_mark_atom_dirty(sctx, &sctx->framebuffer.atom);
 
 	if (sctx->screen->dpbb_allowed)
 		si_mark_atom_dirty(sctx, &sctx->dpbb_state);
 
 	if (sctx->framebuffer.any_dst_linear != old_any_dst_linear)
 		si_mark_atom_dirty(sctx, &sctx->msaa_config);
 
+	if (sctx->screen->has_out_of_order_rast &&
+	    (sctx->framebuffer.colorbuf_enabled_4bit != old_colorbuf_enabled_4bit ||
+	     !!sctx->framebuffer.state.zsbuf != old_has_zsbuf ||
+	     (zstex && !!(zstex->surface.flags & RADEON_SURF_SBUFFER) != old_has_stencil)))
+		si_mark_atom_dirty(sctx, &sctx->msaa_config);
+
 	if (sctx->framebuffer.nr_samples != old_nr_samples) {
 		si_mark_atom_dirty(sctx, &sctx->msaa_config);
 		si_mark_atom_dirty(sctx, &sctx->db_render_state);
 
 		/* Set sample locations as fragment shader constants. */
 		switch (sctx->framebuffer.nr_samples) {
 		case 1:
 			constbuf.user_buffer = sctx->b.sample_locations_1x;
 			break;
 		case 2:
@@ -3049,30 +3107,111 @@ static void si_emit_msaa_sample_locs(struct si_context *sctx,
 		if (has_msaa_sample_loc_bug &&
 		    sctx->framebuffer.nr_samples > 1 &&
 		    rs && !rs->multisample_enable)
 			small_prim_filter_cntl &= C_028830_SMALL_PRIM_FILTER_ENABLE;
 
 		radeon_set_context_reg(cs, R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL,
 				       small_prim_filter_cntl);
 	}
 }
 
+/* Return true if the queued blend/DSA state, the bound framebuffer and the
+ * pixel shader allow out-of-order rasterization (cases are listed below). */
+static bool si_out_of_order_rasterization(struct si_context *sctx)
+{
+	struct si_state_blend *blend = sctx->queued.named.blend;
+	struct si_state_dsa *dsa = sctx->queued.named.dsa;
+
+	if (!sctx->screen->has_out_of_order_rast)
+		return false;
+
+	/* PS with memory stores can't run out-of-order. */
+	if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.writes_memory)
+		return false;
+
+	unsigned colormask = sctx->framebuffer.colorbuf_enabled_4bit &
+			     blend->cb_target_enabled_4bit;
+
+	/* No logic op. */
+	if (colormask && blend->logicop_enable)
+		return false;
+
+	/* zsbuf is NULL when no depth/stencil buffer is bound. */
+	struct pipe_surface *zsbuf = sctx->framebuffer.state.zsbuf;
+	struct r600_texture *zstex =
+		zsbuf ? (struct r600_texture*)zsbuf->texture : NULL;
+	bool has_stencil = zstex && zstex->surface.flags & RADEON_SURF_SBUFFER;
+	bool blend_enabled = (colormask & blend->blend_enable_4bit) != 0;
+
+	/* Out-of-order rasterization can be enabled for these cases:
+	 *
+	 * - color-only rendering:
+	 *   + blending must be enabled and commutative
+	 *   + only when inexact behavior due to rounding is allowed
+	 *
+	 * - depth-only rendering:
+	 *   + depth must force ordering
+	 *
+	 * - stencil-only rendering:
+	 *   + never --- can we do better here?
+	 *
+	 * - color rendering with read-only depth:
+	 *   + blending must be disabled
+	 *   + depth must force ordering
+	 *
+	 * - color rendering with read-only stencil:
+	 *   + blending must be disabled
+	 *
+	 * - color+depth rendering:
+	 *   + blending must be disabled
+	 *   + depth must force ordering
+	 *   + only when Z-fighting is allowed to result in inexact behavior
+	 *
+	 * - color+stencil rendering:
+	 *   + never --- can we do better here?
+	 *
+	 * - color+depth+stencil rendering:
+	 *   + never --- can we do better here?
+	 */
+
+	/* If depth and stencil are disabled, blending must be enabled and commutative. */
+	if (!zsbuf ||
+	    (!dsa->depth_enabled && (!has_stencil || !dsa->stencil_enabled))) {
+		if (!colormask)
+			return true; /* Color writes are disabled. */
+
+		/* If color writes are enabled, blending must be enabled and
+		 * commutative. */
+		return false; /* TODO: check for commutative blending */
+	}
+
+	/* Depth or stencil is enabled. */
+	if (blend_enabled || (has_stencil && !dsa->stencil_can_run_out_of_order))
+		return false;
+
+	return colormask ? dsa->depth_forces_ordering_color_on :
+			   dsa->depth_forces_ordering_color_off;
+}
+
 static void si_emit_msaa_config(struct si_context *sctx, struct r600_atom *atom)
 {
 	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	unsigned num_tile_pipes = sctx->screen->b.info.num_tile_pipes;
 	/* 33% faster rendering to linear color buffers */
 	bool dst_is_linear = sctx->framebuffer.any_dst_linear;
+	bool out_of_order_rast = si_out_of_order_rasterization(sctx);
 	unsigned sc_mode_cntl_1 =
 		S_028A4C_WALK_SIZE(dst_is_linear) |
 		S_028A4C_WALK_FENCE_ENABLE(!dst_is_linear) |
 		S_028A4C_WALK_FENCE_SIZE(num_tile_pipes == 2 ? 2 : 3) |
+		S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(out_of_order_rast) |
+		S_028A4C_OUT_OF_ORDER_WATER_MARK(0x7) |
 		/* always 1: */
 		S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(1) |
 		S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) |
 		S_028A4C_TILE_WALK_ORDER_ENABLE(1) |
 		S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) |
 		S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
 		S_028A4C_FORCE_EOV_REZ_ENABLE(1);
 
 	cayman_emit_msaa_config(cs, sctx->framebuffer.nr_samples,
 				sctx->ps_iter_samples,
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 17d210a..ed66a63 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -42,29 +42,30 @@
 #define SI_NUM_IMAGES			16
 #define SI_NUM_SHADER_BUFFERS		16
 
 struct si_screen;
 struct si_shader;
 struct si_shader_selector;
 
 struct si_state_blend {
 	struct si_pm4_state	pm4;
 	uint32_t		cb_target_mask;
-	bool			alpha_to_coverage;
-	bool			alpha_to_one;
-	bool			dual_src_blend;
 	/* Set 0xf or 0x0 (4 bits) per render target if the following is
 	 * true. ANDed with spi_shader_col_format.
 	 */
 	unsigned		cb_target_enabled_4bit;
 	unsigned		blend_enable_4bit;
 	unsigned		need_src_alpha_4bit;
+	bool			alpha_to_coverage:1;
+	bool			alpha_to_one:1;
+	bool			dual_src_blend:1;
+	bool			logicop_enable:1;
 };
 
 struct si_state_rasterizer {
 	struct si_pm4_state	pm4;
 	/* poly offset states for 16-bit, 24-bit, and 32-bit zbuffers */
 	struct si_pm4_state	*pm4_poly_offset;
 	unsigned		pa_sc_line_stipple;
 	unsigned		pa_cl_clip_cntl;
 	unsigned		sprite_coord_enable:8;
 	unsigned		clip_plane_enable:8;
@@ -91,20 +92,23 @@ struct si_dsa_stencil_ref_part {
 
 struct si_state_dsa {
 	struct si_pm4_state		pm4;
 	struct si_dsa_stencil_ref_part	stencil_ref;
 	ubyte				alpha_func:3;
 	bool				depth_enabled:1;
 	bool				depth_write_enabled:1;
 	bool				stencil_enabled:1;
 	bool				stencil_write_enabled:1;
 	bool				db_can_write:1;
+	bool				stencil_can_run_out_of_order:1;
+	bool				depth_forces_ordering_color_off:1;
+	bool				depth_forces_ordering_color_on:1;
 };
 
 struct si_stencil_ref {
 	struct r600_atom		atom;
 	struct pipe_stencil_ref		state;
 	struct si_dsa_stencil_ref_part	dsa_part;
 };
 
 struct si_vertex_elements
 {
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index fe25598..23ea1a3 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -2405,20 +2405,25 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
 	sctx->ps_shader.current = sel ? sel->first_variant : NULL;
 
 	si_update_common_shader_state(sctx);
 	if (sel) {
 		if (sctx->ia_multi_vgt_param_key.u.uses_tess)
 			si_update_tess_uses_prim_id(sctx);
 
 		if (!old_sel ||
 		    old_sel->info.colors_written != sel->info.colors_written)
 			si_mark_atom_dirty(sctx, &sctx->cb_render_state);
+
+		if (sctx->screen->has_out_of_order_rast &&
+		    (!old_sel ||
+		     (old_sel->info.writes_memory != sel->info.writes_memory)))
+			si_mark_atom_dirty(sctx, &sctx->msaa_config);
 	}
 	si_set_active_descriptors_for_shader(sctx, sel);
 }
 
 static void si_delete_shader(struct si_context *sctx, struct si_shader *shader)
 {
 	if (shader->is_optimized) {
 		util_queue_drop_job(&sctx->screen->shader_compiler_queue_low_priority,
 				    &shader->optimized_ready);
 		util_queue_fence_destroy(&shader->optimized_ready);
-- 
2.7.4



More information about the mesa-dev mailing list