<div dir="ltr">Ping<br></div><div class="gmail_extra"><br><div class="gmail_quote">On Fri, Apr 6, 2018 at 10:31 PM, Marek Olšák <span dir="ltr"><<a href="mailto:maraeo@gmail.com" target="_blank">maraeo@gmail.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">From: Marek Olšák <<a href="mailto:marek.olsak@amd.com">marek.olsak@amd.com</a>><br>
<br>
(This patch doesn't enable the behavior. It will be enabled in a later<br>
commit.)<br>
<br>
Draw calls from multiple IBs can be executed in parallel.<br>
<br>
v2: do emit partial flushes on SI<br>
v3: invalidate all shader caches at the beginning of IBs<br>
v4: don't call si_emit_cache_flush in si_flush_gfx_cs if not needed,<br>
only do this for flushes invoked internally<br>
v5: empty IBs should wait for idle if the flush requires it<br>
v6: split the commit<br>
<br>
If we artificially limit the number of draw calls per IB to 5, we'll get<br>
a lot more IBs, leading to a lot more partial flushes. Let's see how<br>
the removal of partial flushes changes GPU utilization in that scenario:<br>
<br>
With partial flushes (time busy):<br>
CP: 99%<br>
SPI: 86%<br>
CB: 73:<br>
<br>
Without partial flushes (time busy):<br>
CP: 99%<br>
SPI: 93%<br>
CB: 81%<br>
---<br>
src/gallium/drivers/radeon/<wbr>radeon_winsys.h | 7 ++++<br>
src/gallium/drivers/radeonsi/<wbr>si_gfx_cs.c | 52 ++++++++++++++++++++++--------<br>
src/gallium/drivers/radeonsi/<wbr>si_pipe.h | 1 +<br>
3 files changed, 46 insertions(+), 14 deletions(-)<br>
<br>
diff --git a/src/gallium/drivers/radeon/<wbr>radeon_winsys.h b/src/gallium/drivers/radeon/<wbr>radeon_winsys.h<br>
index 157b2e40550..fae4fb7a95d 100644<br>
--- a/src/gallium/drivers/radeon/<wbr>radeon_winsys.h<br>
+++ b/src/gallium/drivers/radeon/<wbr>radeon_winsys.h<br>
@@ -21,20 +21,27 @@<br>
* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,<br>
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR<br>
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE<br>
* USE OR OTHER DEALINGS IN THE SOFTWARE. */<br>
<br>
#ifndef RADEON_WINSYS_H<br>
#define RADEON_WINSYS_H<br>
<br>
/* The public winsys interface header for the radeon driver. */<br>
<br>
+/* Whether the next IB can start immediately and not wait for draws and<br>
+ * dispatches from the current IB to finish. */<br>
+#define RADEON_FLUSH_START_NEXT_GFX_<wbr>IB_NOW (1u << 31)<br>
+<br>
+#define RADEON_FLUSH_ASYNC_START_NEXT_<wbr>GFX_IB_NOW \<br>
+ (PIPE_FLUSH_ASYNC | RADEON_FLUSH_START_NEXT_GFX_<wbr>IB_NOW)<br>
+<br>
#include "pipebuffer/pb_buffer.h"<br>
<br>
#include "amd/common/ac_gpu_info.h"<br>
#include "amd/common/ac_surface.h"<br>
<br>
/* Tiling flags. */<br>
enum radeon_bo_layout {<br>
RADEON_LAYOUT_LINEAR = 0,<br>
RADEON_LAYOUT_TILED,<br>
RADEON_LAYOUT_SQUARETILED,<br>
diff --git a/src/gallium/drivers/<wbr>radeonsi/si_gfx_cs.c b/src/gallium/drivers/<wbr>radeonsi/si_gfx_cs.c<br>
index 2d5e510b19e..63bff29e63a 100644<br>
--- a/src/gallium/drivers/<wbr>radeonsi/si_gfx_cs.c<br>
+++ b/src/gallium/drivers/<wbr>radeonsi/si_gfx_cs.c<br>
@@ -62,25 +62,42 @@ void si_need_gfx_cs_space(struct si_context *ctx)<br>
unsigned need_dwords = 2048 + ctx->num_cs_dw_queries_<wbr>suspend;<br>
if (!ctx->ws->cs_check_space(cs, need_dwords))<br>
si_flush_gfx_cs(ctx, PIPE_FLUSH_ASYNC, NULL);<br>
}<br>
<br>
void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,<br>
struct pipe_fence_handle **fence)<br>
{<br>
struct radeon_winsys_cs *cs = ctx->gfx_cs;<br>
struct radeon_winsys *ws = ctx->ws;<br>
+ unsigned wait_flags = 0;<br>
<br>
if (ctx->gfx_flush_in_progress)<br>
return;<br>
<br>
- if (!radeon_emitted(cs, ctx->initial_gfx_cs_size))<br>
+ if (ctx->chip_class == VI && ctx->screen->info.drm_minor <= 1) {<br>
+ /* DRM 3.1.0 doesn't flush TC for VI correctly. */<br>
+ wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |<br>
+ SI_CONTEXT_CS_PARTIAL_FLUSH |<br>
+ SI_CONTEXT_INV_GLOBAL_L2;<br>
+ } else if (ctx->chip_class == SI) {<br>
+ /* The kernel flushes L2 before shaders are finished. */<br>
+ wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |<br>
+ SI_CONTEXT_CS_PARTIAL_FLUSH;<br>
+ } else if (!(flags & RADEON_FLUSH_START_NEXT_GFX_<wbr>IB_NOW)) {<br>
+ wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |<br>
+ SI_CONTEXT_CS_PARTIAL_FLUSH;<br>
+ }<br>
+<br>
+ /* Drop this flush if it's a no-op. */<br>
+ if (!radeon_emitted(cs, ctx->initial_gfx_cs_size) &&<br>
+ (!wait_flags || !ctx->gfx_last_ib_is_busy))<br>
return;<br>
<br>
if (si_check_device_reset(ctx))<br>
return;<br>
<br>
if (ctx->screen->debug_flags & DBG(CHECK_VM))<br>
flags &= ~PIPE_FLUSH_ASYNC;<br>
<br>
/* If the state tracker is flushing the GFX IB, si_flush_from_st is<br>
* responsible for flushing the DMA IB and merging the fences from both.<br>
@@ -96,27 +113,25 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,<br>
<br>
if (!LIST_IS_EMPTY(&ctx->active_<wbr>queries))<br>
si_suspend_queries(ctx);<br>
<br>
ctx->streamout.suspended = false;<br>
if (ctx->streamout.begin_emitted) {<br>
si_emit_streamout_end(ctx);<br>
ctx->streamout.suspended = true;<br>
}<br>
<br>
- ctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |<br>
- SI_CONTEXT_PS_PARTIAL_FLUSH;<br>
-<br>
- /* DRM 3.1.0 doesn't flush TC for VI correctly. */<br>
- if (ctx->chip_class == VI && ctx->screen->info.drm_minor <= 1)<br>
- ctx->flags |= SI_CONTEXT_INV_GLOBAL_L2 |<br>
- SI_CONTEXT_INV_VMEM_L1;<br>
+ if (wait_flags) {<br>
+ ctx->flags |= wait_flags;<br>
+ si_emit_cache_flush(ctx);<br>
+ }<br>
+ ctx->gfx_last_ib_is_busy = wait_flags == 0;<br>
<br>
/* Make sure CP DMA is idle at the end of IBs after L2 prefetches<br>
* because the kernel doesn't wait for it. */<br>
if (ctx->chip_class >= CIK)<br>
si_cp_dma_wait_for_idle(ctx);<br>
<br>
if (ctx->current_saved_cs) {<br>
si_trace_emit(ctx);<br>
si_log_hw_flush(ctx);<br>
<br>
@@ -180,26 +195,35 @@ static void si_begin_gfx_cs_debug(struct si_context *ctx)<br>
<br>
radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->current_saved_cs->trace_<wbr>buf,<br>
RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);<br>
}<br>
<br>
void si_begin_new_gfx_cs(struct si_context *ctx)<br>
{<br>
if (ctx->is_debug)<br>
si_begin_gfx_cs_debug(ctx);<br>
<br>
- /* Flush read caches at the beginning of CS not flushed by the kernel. */<br>
- if (ctx->chip_class >= CIK)<br>
- ctx->flags |= SI_CONTEXT_INV_SMEM_L1 |<br>
- SI_CONTEXT_INV_ICACHE;<br>
-<br>
- ctx->flags |= SI_CONTEXT_START_PIPELINE_<wbr>STATS;<br>
+ /* Always invalidate caches at the beginning of IBs, because external<br>
+ * users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our<br>
+ * buffers.<br>
+ *<br>
+ * Note that the cache flush done by the kernel at the end of GFX IBs<br>
+ * isn't useful here, because that flush can finish after the following<br>
+ * IB starts drawing.<br>
+ *<br>
+ * TODO: Do we also need to invalidate CB & DB caches?<br>
+ */<br>
+ ctx->flags |= SI_CONTEXT_INV_ICACHE |<br>
+ SI_CONTEXT_INV_SMEM_L1 |<br>
+ SI_CONTEXT_INV_VMEM_L1 |<br>
+ SI_CONTEXT_INV_GLOBAL_L2 |<br>
+ SI_CONTEXT_START_PIPELINE_<wbr>STATS;<br>
<br>
/* set all valid group as dirty so they get reemited on<br>
* next draw command<br>
*/<br>
si_pm4_reset_emitted(ctx);<br>
<br>
/* The CS initialization should be emitted before everything else. */<br>
si_pm4_emit(ctx, ctx->init_config);<br>
if (ctx->init_config_gs_rings)<br>
si_pm4_emit(ctx, ctx->init_config_gs_rings);<br>
diff --git a/src/gallium/drivers/<wbr>radeonsi/si_pipe.h b/src/gallium/drivers/<wbr>radeonsi/si_pipe.h<br>
index 0c90a6c6e46..f0f323ff3a7 100644<br>
--- a/src/gallium/drivers/<wbr>radeonsi/si_pipe.h<br>
+++ b/src/gallium/drivers/<wbr>radeonsi/si_pipe.h<br>
@@ -540,20 +540,21 @@ struct si_context {<br>
void *vs_blit_texcoord;<br>
struct si_screen *screen;<br>
struct pipe_debug_callback debug;<br>
LLVMTargetMachineRef tm; /* only non-threaded compilation */<br>
struct si_shader_ctx_state fixed_func_tcs_shader;<br>
struct r600_resource *wait_mem_scratch;<br>
unsigned wait_mem_number;<br>
uint16_t prefetch_L2_mask;<br>
<br>
bool gfx_flush_in_progress:1;<br>
+ bool gfx_last_ib_is_busy:1;<br>
bool compute_is_busy:1;<br>
<br>
unsigned num_gfx_cs_flushes;<br>
unsigned initial_gfx_cs_size;<br>
unsigned gpu_reset_counter;<br>
unsigned last_dirty_tex_counter;<br>
unsigned last_compressed_colortex_<wbr>counter;<br>
unsigned last_num_draw_calls;<br>
unsigned flags; /* flush flags */<br>
/* Current unaccounted memory usage. */<br>
<span class="HOEnZb"><font color="#888888">-- <br>
2.15.1<br>
<br>
</font></span></blockquote></div><br></div>