<div dir="ltr">Ping<br></div><div class="gmail_extra"><br><div class="gmail_quote">On Fri, Apr 6, 2018 at 10:31 PM, Marek Olšák <span dir="ltr"><<a href="mailto:maraeo@gmail.com" target="_blank">maraeo@gmail.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">From: Marek Olšák <<a href="mailto:marek.olsak@amd.com">marek.olsak@amd.com</a>><br>
<br>
(This patch doesn't enable the behavior. It will be enabled in a later<br>
commit.)<br>
<br>
Draw calls from multiple IBs can be executed in parallel.<br>
<br>
v2: do emit partial flushes on SI<br>
v3: invalidate all shader caches at the beginning of IBs<br>
v4: don't call si_emit_cache_flush in si_flush_gfx_cs if not needed,<br>
    only do this for flushes invoked internally<br>
v5: empty IBs should wait for idle if the flush requires it<br>
v6: split the commit<br>
<br>
If we artificially limit the number of draw calls per IB to 5, we'll get<br>
a lot more IBs, leading to a lot more partial flushes. Let's see how<br>
the removal of partial flushes changes GPU utilization in that scenario:<br>
<br>
With partial flushes (time busy):<br>
    CP: 99%<br>
    SPI: 86%<br>
    CB: 73:<br>
<br>
Without partial flushes (time busy):<br>
    CP: 99%<br>
    SPI: 93%<br>
    CB: 81%<br>
---<br>
 src/gallium/drivers/radeon/<wbr>radeon_winsys.h |  7 ++++<br>
 src/gallium/drivers/radeonsi/<wbr>si_gfx_cs.c   | 52 ++++++++++++++++++++++--------<br>
 src/gallium/drivers/radeonsi/<wbr>si_pipe.h     |  1 +<br>
 3 files changed, 46 insertions(+), 14 deletions(-)<br>
<br>
diff --git a/src/gallium/drivers/radeon/<wbr>radeon_winsys.h b/src/gallium/drivers/radeon/<wbr>radeon_winsys.h<br>
index 157b2e40550..fae4fb7a95d 100644<br>
--- a/src/gallium/drivers/radeon/<wbr>radeon_winsys.h<br>
+++ b/src/gallium/drivers/radeon/<wbr>radeon_winsys.h<br>
@@ -21,20 +21,27 @@<br>
  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,<br>
  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR<br>
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE<br>
  * USE OR OTHER DEALINGS IN THE SOFTWARE. */<br>
<br>
 #ifndef RADEON_WINSYS_H<br>
 #define RADEON_WINSYS_H<br>
<br>
 /* The public winsys interface header for the radeon driver. */<br>
<br>
+/* Whether the next IB can start immediately and not wait for draws and<br>
+ * dispatches from the current IB to finish. */<br>
+#define RADEON_FLUSH_START_NEXT_GFX_<wbr>IB_NOW     (1u << 31)<br>
+<br>
+#define RADEON_FLUSH_ASYNC_START_NEXT_<wbr>GFX_IB_NOW \<br>
+       (PIPE_FLUSH_ASYNC | RADEON_FLUSH_START_NEXT_GFX_<wbr>IB_NOW)<br>
+<br>
 #include "pipebuffer/pb_buffer.h"<br>
<br>
 #include "amd/common/ac_gpu_info.h"<br>
 #include "amd/common/ac_surface.h"<br>
<br>
 /* Tiling flags. */<br>
 enum radeon_bo_layout {<br>
     RADEON_LAYOUT_LINEAR = 0,<br>
     RADEON_LAYOUT_TILED,<br>
     RADEON_LAYOUT_SQUARETILED,<br>
diff --git a/src/gallium/drivers/<wbr>radeonsi/si_gfx_cs.c b/src/gallium/drivers/<wbr>radeonsi/si_gfx_cs.c<br>
index 2d5e510b19e..63bff29e63a 100644<br>
--- a/src/gallium/drivers/<wbr>radeonsi/si_gfx_cs.c<br>
+++ b/src/gallium/drivers/<wbr>radeonsi/si_gfx_cs.c<br>
@@ -62,25 +62,42 @@ void si_need_gfx_cs_space(struct si_context *ctx)<br>
        unsigned need_dwords = 2048 + ctx->num_cs_dw_queries_<wbr>suspend;<br>
        if (!ctx->ws->cs_check_space(cs, need_dwords))<br>
                si_flush_gfx_cs(ctx, PIPE_FLUSH_ASYNC, NULL);<br>
 }<br>
<br>
 void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,<br>
                     struct pipe_fence_handle **fence)<br>
 {<br>
        struct radeon_winsys_cs *cs = ctx->gfx_cs;<br>
        struct radeon_winsys *ws = ctx->ws;<br>
+       unsigned wait_flags = 0;<br>
<br>
        if (ctx->gfx_flush_in_progress)<br>
                return;<br>
<br>
-       if (!radeon_emitted(cs, ctx->initial_gfx_cs_size))<br>
+       if (ctx->chip_class == VI && ctx->screen->info.drm_minor <= 1) {<br>
+               /* DRM 3.1.0 doesn't flush TC for VI correctly. */<br>
+               wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |<br>
+                             SI_CONTEXT_CS_PARTIAL_FLUSH |<br>
+                             SI_CONTEXT_INV_GLOBAL_L2;<br>
+       } else if (ctx->chip_class == SI) {<br>
+               /* The kernel flushes L2 before shaders are finished. */<br>
+               wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |<br>
+                             SI_CONTEXT_CS_PARTIAL_FLUSH;<br>
+       } else if (!(flags & RADEON_FLUSH_START_NEXT_GFX_<wbr>IB_NOW)) {<br>
+               wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |<br>
+                             SI_CONTEXT_CS_PARTIAL_FLUSH;<br>
+       }<br>
+<br>
+       /* Drop this flush if it's a no-op. */<br>
+       if (!radeon_emitted(cs, ctx->initial_gfx_cs_size) &&<br>
+           (!wait_flags || !ctx->gfx_last_ib_is_busy))<br>
                return;<br>
<br>
        if (si_check_device_reset(ctx))<br>
                return;<br>
<br>
        if (ctx->screen->debug_flags & DBG(CHECK_VM))<br>
                flags &= ~PIPE_FLUSH_ASYNC;<br>
<br>
        /* If the state tracker is flushing the GFX IB, si_flush_from_st is<br>
         * responsible for flushing the DMA IB and merging the fences from both.<br>
@@ -96,27 +113,25 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,<br>
<br>
        if (!LIST_IS_EMPTY(&ctx->active_<wbr>queries))<br>
                si_suspend_queries(ctx);<br>
<br>
        ctx->streamout.suspended = false;<br>
        if (ctx->streamout.begin_emitted) {<br>
                si_emit_streamout_end(ctx);<br>
                ctx->streamout.suspended = true;<br>
        }<br>
<br>
-       ctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |<br>
-                       SI_CONTEXT_PS_PARTIAL_FLUSH;<br>
-<br>
-       /* DRM 3.1.0 doesn't flush TC for VI correctly. */<br>
-       if (ctx->chip_class == VI && ctx->screen->info.drm_minor <= 1)<br>
-               ctx->flags |= SI_CONTEXT_INV_GLOBAL_L2 |<br>
-                               SI_CONTEXT_INV_VMEM_L1;<br>
+       if (wait_flags) {<br>
+               ctx->flags |= wait_flags;<br>
+               si_emit_cache_flush(ctx);<br>
+       }<br>
+       ctx->gfx_last_ib_is_busy = wait_flags == 0;<br>
<br>
        /* Make sure CP DMA is idle at the end of IBs after L2 prefetches<br>
         * because the kernel doesn't wait for it. */<br>
        if (ctx->chip_class >= CIK)<br>
                si_cp_dma_wait_for_idle(ctx);<br>
<br>
        if (ctx->current_saved_cs) {<br>
                si_trace_emit(ctx);<br>
                si_log_hw_flush(ctx);<br>
<br>
@@ -180,26 +195,35 @@ static void si_begin_gfx_cs_debug(struct si_context *ctx)<br>
<br>
        radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->current_saved_cs->trace_<wbr>buf,<br>
                              RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);<br>
 }<br>
<br>
 void si_begin_new_gfx_cs(struct si_context *ctx)<br>
 {<br>
        if (ctx->is_debug)<br>
                si_begin_gfx_cs_debug(ctx);<br>
<br>
-       /* Flush read caches at the beginning of CS not flushed by the kernel. */<br>
-       if (ctx->chip_class >= CIK)<br>
-               ctx->flags |= SI_CONTEXT_INV_SMEM_L1 |<br>
-                               SI_CONTEXT_INV_ICACHE;<br>
-<br>
-       ctx->flags |= SI_CONTEXT_START_PIPELINE_<wbr>STATS;<br>
+       /* Always invalidate caches at the beginning of IBs, because external<br>
+        * users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our<br>
+        * buffers.<br>
+        *<br>
+        * Note that the cache flush done by the kernel at the end of GFX IBs<br>
+        * isn't useful here, because that flush can finish after the following<br>
+        * IB starts drawing.<br>
+        *<br>
+        * TODO: Do we also need to invalidate CB & DB caches?<br>
+        */<br>
+       ctx->flags |= SI_CONTEXT_INV_ICACHE |<br>
+                     SI_CONTEXT_INV_SMEM_L1 |<br>
+                     SI_CONTEXT_INV_VMEM_L1 |<br>
+                     SI_CONTEXT_INV_GLOBAL_L2 |<br>
+                     SI_CONTEXT_START_PIPELINE_<wbr>STATS;<br>
<br>
        /* set all valid group as dirty so they get reemited on<br>
         * next draw command<br>
         */<br>
        si_pm4_reset_emitted(ctx);<br>
<br>
        /* The CS initialization should be emitted before everything else. */<br>
        si_pm4_emit(ctx, ctx->init_config);<br>
        if (ctx->init_config_gs_rings)<br>
                si_pm4_emit(ctx, ctx->init_config_gs_rings);<br>
diff --git a/src/gallium/drivers/<wbr>radeonsi/si_pipe.h b/src/gallium/drivers/<wbr>radeonsi/si_pipe.h<br>
index 0c90a6c6e46..f0f323ff3a7 100644<br>
--- a/src/gallium/drivers/<wbr>radeonsi/si_pipe.h<br>
+++ b/src/gallium/drivers/<wbr>radeonsi/si_pipe.h<br>
@@ -540,20 +540,21 @@ struct si_context {<br>
        void                            *vs_blit_texcoord;<br>
        struct si_screen                *screen;<br>
        struct pipe_debug_callback      debug;<br>
        LLVMTargetMachineRef            tm; /* only non-threaded compilation */<br>
        struct si_shader_ctx_state      fixed_func_tcs_shader;<br>
        struct r600_resource            *wait_mem_scratch;<br>
        unsigned                        wait_mem_number;<br>
        uint16_t                        prefetch_L2_mask;<br>
<br>
        bool                            gfx_flush_in_progress:1;<br>
+       bool                            gfx_last_ib_is_busy:1;<br>
        bool                            compute_is_busy:1;<br>
<br>
        unsigned                        num_gfx_cs_flushes;<br>
        unsigned                        initial_gfx_cs_size;<br>
        unsigned                        gpu_reset_counter;<br>
        unsigned                        last_dirty_tex_counter;<br>
        unsigned                        last_compressed_colortex_<wbr>counter;<br>
        unsigned                        last_num_draw_calls;<br>
        unsigned                        flags; /* flush flags */<br>
        /* Current unaccounted memory usage. */<br>
<span class="HOEnZb"><font color="#888888">-- <br>
2.15.1<br>
<br>
</font></span></blockquote></div><br></div>