[Mesa-dev] [PATCH 4/4] OPTIONAL: anv/gen9: Optimize slice and subslice load balancing behavior.
Jason Ekstrand
jason at jlekstrand.net
Sat Aug 10 13:57:18 UTC 2019
This gets us +2.7% on Aztec Ruins (5 runs on each branch)
On Sat, Aug 10, 2019 at 7:31 AM Jason Ekstrand <jason at jlekstrand.net> wrote:
> Let's hold this for at least a tiny bit. I'll run some benchmarks and may
> want to tweak how it interacts with the rest of the vulkan state
> tracking.
> Thanks for figuring out a heuristic!
>
> --Jason
>
> On August 9, 2019 19:22:48 Francisco Jerez <currojerez at riseup.net> wrote:
>
> > See "i965/gen9: Optimize slice and subslice load balancing behavior."
> > for the rationale. Marked optional because no performance evaluation
> > has been done on this commit, it is provided to match the hashing
> > settings of the Iris driver. Test reports welcome.
> > ---
> > src/intel/vulkan/anv_genX.h | 4 ++
> > src/intel/vulkan/anv_private.h | 6 ++
> > src/intel/vulkan/genX_blorp_exec.c | 6 ++
> > src/intel/vulkan/genX_cmd_buffer.c | 96 ++++++++++++++++++++++++++++++
> > 4 files changed, 112 insertions(+)
> >
> > diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h
> > index a5435e566a3..06c6b467acf 100644
> > --- a/src/intel/vulkan/anv_genX.h
> > +++ b/src/intel/vulkan/anv_genX.h
> > @@ -44,6 +44,10 @@ void genX(cmd_buffer_apply_pipe_flushes)(struct
> > anv_cmd_buffer *cmd_buffer);
> >
> > void genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer
> *cmd_buffer);
> >
> > +void genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer
> *cmd_buffer,
> > + unsigned width, unsigned height,
> > + unsigned scale);
> > +
> > void genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer);
> > void genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer
> *cmd_buffer);
> >
> > diff --git a/src/intel/vulkan/anv_private.h
> b/src/intel/vulkan/anv_private.h
> > index 2465f264354..b381386a716 100644
> > --- a/src/intel/vulkan/anv_private.h
> > +++ b/src/intel/vulkan/anv_private.h
> > @@ -2421,6 +2421,12 @@ struct anv_cmd_state {
> >
> > bool
> conditional_render_enabled;
> >
> > + /**
> > + * Last rendering scale argument provided to
> > + * genX(cmd_buffer_emit_hashing_mode)().
> > + */
> > + unsigned current_hash_scale;
> > +
> > /**
> > * Array length is anv_cmd_state::pass::attachment_count. Array
> content is
> > * valid only when recording a render pass instance.
> > diff --git a/src/intel/vulkan/genX_blorp_exec.c
> > b/src/intel/vulkan/genX_blorp_exec.c
> > index 1592e7f7e3d..e9eedc06696 100644
> > --- a/src/intel/vulkan/genX_blorp_exec.c
> > +++ b/src/intel/vulkan/genX_blorp_exec.c
> > @@ -223,6 +223,12 @@ genX(blorp_exec)(struct blorp_batch *batch,
> > genX(cmd_buffer_config_l3)(cmd_buffer, cfg);
> > }
> >
> > + const unsigned scale = params->fast_clear_op ? UINT_MAX : 1;
> > + if (cmd_buffer->state.current_hash_scale != scale) {
> > + genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, params->x1 -
> params->x0,
> > + params->y1 - params->y0,
> scale);
> > + }
> > +
> > #if GEN_GEN >= 11
> > /* The PIPE_CONTROL command description says:
> > *
> > diff --git a/src/intel/vulkan/genX_cmd_buffer.c
> > b/src/intel/vulkan/genX_cmd_buffer.c
> > index 86ef1663ac4..e9e5570d49f 100644
> > --- a/src/intel/vulkan/genX_cmd_buffer.c
> > +++ b/src/intel/vulkan/genX_cmd_buffer.c
> > @@ -1595,6 +1595,7 @@ genX(CmdExecuteCommands)(
> > */
> > primary->state.current_pipeline = UINT32_MAX;
> > primary->state.current_l3_config = NULL;
> > + primary->state.current_hash_scale = 0;
> >
> > /* Each of the secondary command buffers will use its own state base
> > * address. We need to re-emit state base address for the primary
> after
> > @@ -2663,6 +2664,9 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer
> > *cmd_buffer)
> >
> > genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config);
> >
> > + if (cmd_buffer->state.current_hash_scale != 1)
> > + genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX,
> UINT_MAX, 1);
> > +
> > genX(flush_pipeline_select_3d)(cmd_buffer);
> >
> > if (vb_emit) {
> > @@ -3925,6 +3929,98 @@ genX(cmd_buffer_emit_gen7_depth_flush)(struct
> > anv_cmd_buffer *cmd_buffer)
> > }
> > }
> >
> > +/**
> > + * Update the pixel hashing modes that determine the balancing of PS
> threads
> > + * across subslices and slices.
> > + *
> > + * \param width Width bound of the rendering area (already scaled down
> if \p
> > + * scale is greater than 1).
> > + * \param height Height bound of the rendering area (already scaled
> down if \p
> > + * scale is greater than 1).
> > + * \param scale The number of framebuffer samples that could
> potentially be
> > + * affected by an individual channel of the PS thread.
> This is
> > + * typically one for single-sampled rendering, but for
> operations
> > + * like CCS resolves and fast clears a single PS
> invocation may
> > + * update a huge number of pixels, in which case a finer
> > + * balancing is desirable in order to maximally utilize the
> > + * bandwidth available. UINT_MAX can be used as shorthand
> for
> > + * "finest hashing mode available".
> > + */
> > +void
> > +genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
> > + unsigned width, unsigned height,
> > + unsigned scale)
> > +{
> > +#if GEN_GEN == 9
> > + const struct gen_device_info *devinfo = &cmd_buffer->device->info;
> > + const unsigned slice_hashing[] = {
> > + /* Because all Gen9 platforms with more than one slice require
> > + * three-way subslice hashing, a single "normal" 16x16 slice
> hashing
> > + * block is guaranteed to suffer from substantial imbalance, with
> one
> > + * subslice receiving twice as much work as the other two in the
> > + * slice.
> > + *
> > + * The performance impact of that would be particularly severe
> when
> > + * three-way hashing is also in use for slice balancing (which is
> the
> > + * case for all Gen9 GT4 platforms), because one of the slices
> > + * receives one every three 16x16 blocks in either direction,
> which
> > + * is roughly the periodicity of the underlying subslice imbalance
> > + * pattern ("roughly" because in reality the hardware's
> > + * implementation of three-way hashing doesn't do exact modulo 3
> > + * arithmetic, which somewhat decreases the magnitude of this
> effect
> > + * in practice). This leads to a systematic subslice imbalance
> > + * within that slice regardless of the size of the primitive. The
> > + * 32x32 hashing mode guarantees that the subslice imbalance
> within a
> > + * single slice hashing block is minimal, largely eliminating this
> > + * effect.
> > + */
> > + _32x32,
> > + /* Finest slice hashing mode available. */
> > + NORMAL
> > + };
> > + const unsigned subslice_hashing[] = {
> > + /* 16x16 would provide a slight cache locality benefit especially
> > + * visible in the sampler L1 cache efficiency of low-bandwidth
> > + * non-LLC platforms, but it comes at the cost of greater subslice
> > + * imbalance for primitives of dimensions approximately
> intermediate
> > + * between 16x4 and 16x16.
> > + */
> > + _16x4,
> > + /* Finest subslice hashing mode available. */
> > + _8x4
> > + };
> > + /* Dimensions of the smallest hashing block of a given hashing
> mode. If
> > + * the rendering area is smaller than this there can't possibly be
> any
> > + * benefit from switching to this mode, so we optimize out the
> > + * transition.
> > + */
> > + const unsigned min_size[][2] = {
> > + { 16, 4 },
> > + { 8, 4 }
> > + };
> > + const unsigned idx = scale > 1;
> > +
> > + if (width > min_size[idx][0] || height > min_size[idx][1]) {
> > + uint32_t gt_mode;
> > +
> > + anv_pack_struct(&gt_mode, GENX(GT_MODE),
> > + .SliceHashing = (devinfo->num_slices > 1 ?
> > slice_hashing[idx] : 0),
> > + .SliceHashingMask = (devinfo->num_slices > 1 ? -1
> : 0),
> > + .SubsliceHashing = subslice_hashing[idx],
> > + .SubsliceHashingMask = -1);
> > +
> > + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
> > + pc.StallAtPixelScoreboard = true;
> > + pc.CommandStreamerStallEnable = true;
> > + }
> > +
> > + emit_lri(&cmd_buffer->batch, GENX(GT_MODE_num), gt_mode);
> > +
> > + cmd_buffer->state.current_hash_scale = scale;
> > + }
> > +#endif
> > +}
> > +
> > static void
> > cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
> > {
> > --
> > 2.22.0
> >
> > _______________________________________________
> > mesa-dev mailing list
> > mesa-dev at lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
>
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20190810/6661c674/attachment-0001.html>
More information about the mesa-dev
mailing list