<div dir="ltr">Ping </div><div class="gmail_extra"> <div class="gmail_quote">On Fri, Apr 6, 2018 at 10:31 PM, Marek Olšák <<a href="mailto:maraeo@gmail.com" target="_blank">maraeo@gmail.com</a>> wrote: <blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">From: Marek Olšák <<a href="mailto:marek.olsak@amd.com">marek.olsak@amd.com</a>> (This patch doesn't enable the behavior. It will be enabled in a later commit.) Draw calls from multiple IBs can be executed in parallel. v2: do emit partial flushes on SI v3: invalidate all shader caches at the beginning of IBs v4: don't call si_emit_cache_flush in si_flush_gfx_cs if not needed, only do this for flushes invoked internally v5: empty IBs should wait for idle if the flush requires it v6: split the commit If we artificially limit the number of draw calls per IB to 5, we'll get a lot more IBs, leading to a lot more partial flushes. Let's see how the removal of partial flushes changes GPU utilization in that scenario: With partial flushes (time busy): CP: 99% SPI: 86% CB: 73: Without partial flushes (time busy): CP: 99% SPI: 93% CB: 81% --- src/gallium/drivers/radeon/radeon_winsys.h | 7 ++++ src/gallium/drivers/radeonsi/si_gfx_cs.c | 52 ++++++++++++++++++++++-------- src/gallium/drivers/radeonsi/si_pipe.h | 1 + 3 files changed, 46 insertions(+), 14 deletions(-) diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h index 157b2e40550..fae4fb7a95d 100644 --- a/src/gallium/drivers/radeon/radeon_winsys.h +++ b/src/gallium/drivers/radeon/radeon_winsys.h @@ -21,20 +21,27 @@ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * USE OR OTHER DEALINGS IN THE SOFTWARE. */ #ifndef RADEON_WINSYS_H #define RADEON_WINSYS_H /* The public winsys interface header for the radeon driver. */ +/* Whether the next IB can start immediately and not wait for draws and + * dispatches from the current IB to finish. */ +#define RADEON_FLUSH_START_NEXT_GFX_IB_NOW (1u << 31) + +#define RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW \ + (PIPE_FLUSH_ASYNC | RADEON_FLUSH_START_NEXT_GFX_IB_NOW) + #include "pipebuffer/pb_buffer.h" #include "amd/common/ac_gpu_info.h" #include "amd/common/ac_surface.h" /* Tiling flags. */ enum radeon_bo_layout { RADEON_LAYOUT_LINEAR = 0, RADEON_LAYOUT_TILED, RADEON_LAYOUT_SQUARETILED, diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index 2d5e510b19e..63bff29e63a 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -62,25 +62,42 @@ void si_need_gfx_cs_space(struct si_context *ctx) unsigned need_dwords = 2048 + ctx->num_cs_dw_queries_suspend; if (!ctx->ws->cs_check_space(cs, need_dwords)) si_flush_gfx_cs(ctx, PIPE_FLUSH_ASYNC, NULL); } void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence) { struct radeon_winsys_cs *cs = ctx->gfx_cs; struct radeon_winsys *ws = ctx->ws; + unsigned wait_flags = 0; if (ctx->gfx_flush_in_progress) return; - if (!radeon_emitted(cs, ctx->initial_gfx_cs_size)) + if (ctx->chip_class == VI && ctx->screen->info.drm_minor <= 1) { + /* DRM 3.1.0 doesn't flush TC for VI correctly. */ + wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | + SI_CONTEXT_CS_PARTIAL_FLUSH | + SI_CONTEXT_INV_GLOBAL_L2; + } else if (ctx->chip_class == SI) { + /* The kernel flushes L2 before shaders are finished. */ + wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | + SI_CONTEXT_CS_PARTIAL_FLUSH; + } else if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) { + wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | + SI_CONTEXT_CS_PARTIAL_FLUSH; + } + + /* Drop this flush if it's a no-op. */ + if (!radeon_emitted(cs, ctx->initial_gfx_cs_size) && + (!wait_flags || !ctx->gfx_last_ib_is_busy)) return; if (si_check_device_reset(ctx)) return; if (ctx->screen->debug_flags & DBG(CHECK_VM)) flags &= ~PIPE_FLUSH_ASYNC; /* If the state tracker is flushing the GFX IB, si_flush_from_st is * responsible for flushing the DMA IB and merging the fences from both. @@ -96,27 +113,25 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, if (!LIST_IS_EMPTY(&ctx->active_queries)) si_suspend_queries(ctx); ctx->streamout.suspended = false; if (ctx->streamout.begin_emitted) { si_emit_streamout_end(ctx); ctx->streamout.suspended = true; } - ctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | - SI_CONTEXT_PS_PARTIAL_FLUSH; - - /* DRM 3.1.0 doesn't flush TC for VI correctly. */ - if (ctx->chip_class == VI && ctx->screen->info.drm_minor <= 1) - ctx->flags |= SI_CONTEXT_INV_GLOBAL_L2 | - SI_CONTEXT_INV_VMEM_L1; + if (wait_flags) { + ctx->flags |= wait_flags; + si_emit_cache_flush(ctx); + } + ctx->gfx_last_ib_is_busy = wait_flags == 0; /* Make sure CP DMA is idle at the end of IBs after L2 prefetches * because the kernel doesn't wait for it. */ if (ctx->chip_class >= CIK) si_cp_dma_wait_for_idle(ctx); if (ctx->current_saved_cs) { si_trace_emit(ctx); si_log_hw_flush(ctx); @@ -180,26 +195,35 @@ static void si_begin_gfx_cs_debug(struct si_context *ctx) radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->current_saved_cs->trace_buf, RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE); } void si_begin_new_gfx_cs(struct si_context *ctx) { if (ctx->is_debug) si_begin_gfx_cs_debug(ctx); - /* Flush read caches at the beginning of CS not flushed by the kernel. */ - if (ctx->chip_class >= CIK) - ctx->flags |= SI_CONTEXT_INV_SMEM_L1 | - SI_CONTEXT_INV_ICACHE; - - ctx->flags |= SI_CONTEXT_START_PIPELINE_STATS; + /* Always invalidate caches at the beginning of IBs, because external + * users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our + * buffers. + * + * Note that the cache flush done by the kernel at the end of GFX IBs + * isn't useful here, because that flush can finish after the following + * IB starts drawing. + * + * TODO: Do we also need to invalidate CB & DB caches? + */ + ctx->flags |= SI_CONTEXT_INV_ICACHE | + SI_CONTEXT_INV_SMEM_L1 | + SI_CONTEXT_INV_VMEM_L1 | + SI_CONTEXT_INV_GLOBAL_L2 | + SI_CONTEXT_START_PIPELINE_STATS; /* set all valid group as dirty so they get reemited on * next draw command */ si_pm4_reset_emitted(ctx); /* The CS initialization should be emitted before everything else. */ si_pm4_emit(ctx, ctx->init_config); if (ctx->init_config_gs_rings) si_pm4_emit(ctx, ctx->init_config_gs_rings); diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 0c90a6c6e46..f0f323ff3a7 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -540,20 +540,21 @@ struct si_context { void *vs_blit_texcoord; struct si_screen *screen; struct pipe_debug_callback debug; LLVMTargetMachineRef tm; /* only non-threaded compilation */ struct si_shader_ctx_state fixed_func_tcs_shader; struct r600_resource *wait_mem_scratch; unsigned wait_mem_number; uint16_t prefetch_L2_mask; bool gfx_flush_in_progress:1; + bool gfx_last_ib_is_busy:1; bool compute_is_busy:1; unsigned num_gfx_cs_flushes; unsigned initial_gfx_cs_size; unsigned gpu_reset_counter; unsigned last_dirty_tex_counter; unsigned last_compressed_colortex_counter; unsigned last_num_draw_calls; unsigned flags; /* flush flags */ /* Current unaccounted memory usage. */ -- 2.15.1 </blockquote></div> </div>