[Mesa-dev] [PATCH] radv: Implement alternate GFX9 scissor workaround.

Mon May 28 09:39:25 UTC 2018

Nice find!

Though, we can now remove the checks in 
radv_CmdSetScissor/radv_CmdSetViewport because they were added to reduce 
the number of flushes.

With that, patch is:

Reviewed-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>

On 05/27/2018 06:57 PM, Bas Nieuwenhuizen wrote:
> This improves dota2 performance for me by 11% when I force the
> GPU DPM level to low (otherwise dota2 is CPU limited for 4k on my
> threadripper), which should be a large part of the radv-amdvlk gap.
> (For me with that was radv 60.3 -> 66.6, while AMDVLK does about 68
> fps)
> 
> It looks like dota2 rendered the GUI with a bunch of draws with
> a SetScissors before almost each draw, causing a lot of pipeline
> stalls.
> 
> I'm not really happy with the duplication of code, but overriding
> radeon_set_context_reg would also be messy since we have the
> pre-recorded pipelines and a bunch of si_cmd_buffer code, as well
> as some memory->context reg loads for which things would be more
> complicated.
> ---
>   src/amd/vulkan/radv_cmd_buffer.c | 56 +++++++++++++++++++++++++++-----
>   1 file changed, 47 insertions(+), 9 deletions(-)
> 
> diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
> index 5ab577b4c59..08be50995c1 100644
> --- a/src/amd/vulkan/radv_cmd_buffer.c
> +++ b/src/amd/vulkan/radv_cmd_buffer.c
> @@ -869,14 +869,6 @@ radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer)
>   {
>   	uint32_t count = cmd_buffer->state.dynamic.scissor.count;
>   
> -	/* Vega10/Raven scissor bug workaround. This must be done before VPORT
> -	 * scissor registers are changed. There is also a more efficient but
> -	 * more involved alternative workaround.
> -	 */
> -	if (cmd_buffer->device->physical_device->has_scissor_bug) {
> -		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
> -		si_emit_cache_flush(cmd_buffer);
> -	}
>   	si_write_scissors(cmd_buffer->cs, 0, count,
>   			  cmd_buffer->state.dynamic.scissor.scissors,
>   			  cmd_buffer->state.dynamic.viewport.viewports,
> @@ -1403,7 +1395,8 @@ radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer)
>   	if (states & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT))
>   		radv_emit_viewport(cmd_buffer);
>   
> -	if (states & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT))
> +	if (states & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT) &&
> +	    !cmd_buffer->device->physical_device->has_scissor_bug)
>   		radv_emit_scissor(cmd_buffer);
>   
>   	if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH)
> @@ -3144,10 +3137,52 @@ radv_emit_draw_packets(struct radv_cmd_buffer *cmd_buffer,
>   	}
>   }
>   
> +/*
> + * Vega and raven have a bug which triggers if there are multiple context
> + * register contexts active at the same time with different scissor values.
> + *
> + * There are two possible workarounds:
> + * 1) Wait for PS_PARTIAL_FLUSH every time the scissor is changed. That way
> + *    there is only ever 1 active set of scissor values at the same time.
> + *
> + * 2) Whenever the hardware switches contexts we have to set the scissor
> + *    registers again even if it is a noop. That way the new context gets
> + *    the correct scissor values.
> + *
> + * This implements option 2. radv_need_late_scissor_emission needs to
> + * return true on affected HW if radv_emit_all_graphics_states sets
> + * any context registers.
> + */
> +static bool radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer,
> +                                            bool indexed_draw)
> +{
> +	struct radv_cmd_state *state = &cmd_buffer->state;
> +
> +	if (!cmd_buffer->device->physical_device->has_scissor_bug)
> +		return false;
> +
> +	/* Assume all state changes except  these two can imply context rolls. */
> +	if (cmd_buffer->state.dirty & ~(RADV_CMD_DIRTY_INDEX_BUFFER |
> +	                                RADV_CMD_DIRTY_VERTEX_BUFFER |
> +	                                RADV_CMD_DIRTY_PIPELINE))
> +		return true;
> +
> +	if (cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipeline)
> +		return true;
> +
> +	if (indexed_draw && state->pipeline->graphics.prim_restart_enable &&
> +	    (state->index_type ? 0xffffffffu : 0xffffu) != state->last_primitive_reset_index)
> +		return true;
> +
> +	return false;
> +}
> +
>   static void
>   radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer,
>   			      const struct radv_draw_info *info)
>   {
> +	bool late_scissor_emission = radv_need_late_scissor_emission(cmd_buffer, info->indexed);
> +
>   	if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) ||
>   	    cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipeline)
>   		radv_emit_rbplus_state(cmd_buffer);
> @@ -3177,6 +3212,9 @@ radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer,
>   	radv_emit_draw_registers(cmd_buffer, info->indexed,
>   				 info->instance_count > 1, info->indirect,
>   				 info->indirect ? 0 : info->count);
> +
> +	if (late_scissor_emission)
> +		radv_emit_scissor(cmd_buffer);
>   }
>   
>   static void
>