[Mesa-dev] [PATCH 09/12] anv/cmd_buffer: Add code for performing HZ operations

Jason Ekstrand jason at jlekstrand.net
Sat Sep 3 00:01:28 UTC 2016


On Wed, Aug 31, 2016 at 8:29 PM, Nanley Chery <nanleychery at gmail.com> wrote:

> From: Jason Ekstrand <jason.ekstrand at intel.com>
>

First off, this is your patch not mine.  The patch of mine you based this
on was little more than a skeleton that demonstrated how to use
PIPE_CONTROL.  All of the interesting stuff in here is yours.


> Nanley Chery:
> (rebase)
>  - Resolve conflicts with the new anv_batch_emit macro
> (amend)
>  - Update commit title
>  - Combine all HZ operations into one function
>  - Add code for performing HiZ resolve operations
>  - Add proper stencil and multisampling support
>  - Set the proper clear rectangles
>  - Add required cases for aborting an HZ operation
>
> Signed-off-by: Nanley Chery <nanley.g.chery at intel.com>
> ---
>  src/intel/vulkan/anv_genX.h        |   3 +
>  src/intel/vulkan/anv_private.h     |   6 ++
>  src/intel/vulkan/gen7_cmd_buffer.c |   5 ++
>  src/intel/vulkan/gen8_cmd_buffer.c | 124 ++++++++++++++++++++++++++++++
> +++++++
>  4 files changed, 138 insertions(+)
>
> diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h
> index cf5a232..16de990 100644
> --- a/src/intel/vulkan/anv_genX.h
> +++ b/src/intel/vulkan/anv_genX.h
> @@ -54,6 +54,9 @@ void genX(cmd_buffer_flush_dynamic_state)(struct
> anv_cmd_buffer *cmd_buffer);
>
>  void genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer
> *cmd_buffer);
>
> +void genX(cmd_buffer_do_hz_op)(struct anv_cmd_buffer *cmd_buffer,
> +                               enum anv_hz_op op);
> +
>  VkResult
>  genX(graphics_pipeline_create)(VkDevice _device,
>                                 struct anv_pipeline_cache *cache,
> diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_
> private.h
> index 5718a19..40325fd 100644
> --- a/src/intel/vulkan/anv_private.h
> +++ b/src/intel/vulkan/anv_private.h
> @@ -1401,6 +1401,12 @@ anv_cmd_buffer_get_depth_stencil_view(const struct
> anv_cmd_buffer *cmd_buffer);
>
>  void anv_cmd_buffer_dump(struct anv_cmd_buffer *cmd_buffer);
>
> +enum anv_hz_op {
> +   ANV_HZ_OP_CLEAR,
> +   ANV_HZ_OP_HIZ_RESOLVE,
> +   ANV_HZ_OP_DEPTH_RESOLVE,
> +};
>

Now that blorp is in its own folder, we could use the blorp_hiz_op enum
instead of rolling our own.  That'll make it easier to add gen7 support.


> +
>  struct anv_fence {
>     struct anv_bo bo;
>     struct drm_i915_gem_execbuffer2 execbuf;
> diff --git a/src/intel/vulkan/gen7_cmd_buffer.c
> b/src/intel/vulkan/gen7_cmd_buffer.c
> index 61778aa..a057a04 100644
> --- a/src/intel/vulkan/gen7_cmd_buffer.c
> +++ b/src/intel/vulkan/gen7_cmd_buffer.c
> @@ -323,6 +323,11 @@ genX(cmd_buffer_flush_dynamic_state)(struct
> anv_cmd_buffer *cmd_buffer)
>     cmd_buffer->state.dirty = 0;
>  }
>
> +void
> +genX(cmd_buffer_do_hz_op)(struct anv_cmd_buffer *cmd_buffer, enum
> anv_hz_op op)
> +{
> +}
> +
>  void genX(CmdSetEvent)(
>      VkCommandBuffer                             commandBuffer,
>      VkEvent                                     event,
> diff --git a/src/intel/vulkan/gen8_cmd_buffer.c
> b/src/intel/vulkan/gen8_cmd_buffer.c
> index e22b4e2..4f27350 100644
> --- a/src/intel/vulkan/gen8_cmd_buffer.c
> +++ b/src/intel/vulkan/gen8_cmd_buffer.c
> @@ -399,6 +399,130 @@ genX(cmd_buffer_flush_compute_state)(struct
> anv_cmd_buffer *cmd_buffer)
>     genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
>  }
>
> +
> +/**
> + * Emit the HZ_OP packet in the sequence specified by the BDW PRM section
> + * entitled: "Optimized Depth Buffer Clear and/or Stencil Buffer Clear."
> + */
> +void
> +genX(cmd_buffer_do_hz_op)(struct anv_cmd_buffer *cmd_buffer, enum
> anv_hz_op op)
> +{
> +   struct anv_cmd_state *cmd_state = &cmd_buffer->state;
> +   const struct anv_image_view *iview =
> +      anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
> +
> +   if (iview == NULL || !anv_image_has_hiz(iview->image))
> +      return;
>

This looks like something that would be better as an assert.  Silently
doing nothing is probably fine for resolves.  For clears on the other hand,
it means silently *not* clearing which would be bad.


> +
> +   const uint32_t ds = cmd_state->subpass->depth_stencil_attachment;
> +   const bool full_surface_op =
> +             cmd_state->render_area.extent.width == iview->extent.width
> &&
> +             cmd_state->render_area.extent.height ==
> iview->extent.height;
>

I think you also need render_area.offset == 0 checks?


> +
> +   /* Validate that we can perform the HZ operation and that it's
> necessary. */
> +   switch (op) {
> +   case ANV_HZ_OP_CLEAR:
> +      if (cmd_buffer->state.pass->attachments[ds].load_op !=
> +          VK_ATTACHMENT_LOAD_OP_CLEAR)
> +         return;
> +
> +      /* Apply alignment restrictions. For a sample count of 16, the
> formulas
> +       * reduce to identity and indicate that no alignment is required.
> +       */
>

I think you're being a bit over-restrictive here.  From the Sky Lake PRM
Vol. 7 "Hierarchical Depth Buffer" (p. 626 in my pdf):

"The minimum granularity of clear is one pixel, but all samples of the
pixel must be cleared. Clearing partial samples of a pixel is not
supported. If a newly allocated depth buffer is not padded to an integer
multiple of 8x4 pixels, and if the first operation on the depth buffer does
not clear the entire width and height of the surface, then first a HiZ
ambiguate must be done on the portions of the depth buffer that are not
cleared. If the depth buffer clear operation does clear the entire width
and height of the surface, then the “full surface clear” bit in
3DSTATE_WM_OP
must be set to 1."

In other words, the granularity of the HiZ operation is 1 pixel.  However,
you need to ensure that the the HiZ buffer is valid for all 8x4 blocks or
else strange things may happen.  We already do this with a memset at
BindImageMemory time so our HiZ buffer should always contain valid data.

In the Broadwell PRM, same section, it does list that alignment requirement
but says that it only applies to D16_UNORM surfaces.  For all other surface
formats, Broadwell appears to follow the Sky Lake rules where, as long as
the HiZ buffer contains valid data, everything goes.

I think we need a crucible test for partial (maybe even multisampled) depth
clears.


> +      if (!full_surface_op && iview->image->samples < 16) {
>

If my understanding of Sky Lake is correct, the samples < 16 doesn't matter
because the alignment restrictions don't apply to Sky Lake anyway.


> +         uint32_t align_w = 1;
> +         uint32_t align_h = 1;
> +
> +         if (iview->image->samples > 1) {
> +            isl_msaa_interleaved_scale_px_to_sa(iview->image->samples,
> +                                                &align_w, &align_h);
> +         }
> +
> +         align_w = 8 / align_w;
> +         align_h = 4 / align_h;
> +
> +         if (cmd_state->render_area.offset.x % align_w ||
> +             cmd_state->render_area.offset.y % align_h ||
> +             cmd_state->render_area.extent.width % align_w ||
> +             cmd_state->render_area.extent.height % align_h)
> +            return;
> +      }
> +      break;
> +   case ANV_HZ_OP_DEPTH_RESOLVE:
> +      if (cmd_buffer->state.pass->attachments[ds].store_op !=
> +          VK_ATTACHMENT_STORE_OP_STORE)
> +         return;
> +      break;
> +   case ANV_HZ_OP_HIZ_RESOLVE:
> +      if (cmd_buffer->state.pass->attachments[ds].load_op !=
> +          VK_ATTACHMENT_LOAD_OP_LOAD)
> +         return;
> +      break;
> +   }
> +
> +   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_WM_HZ_OP), hzp) {
> +      switch (op) {
> +      case ANV_HZ_OP_CLEAR:
> +         hzp.StencilBufferClearEnable = VK_IMAGE_ASPECT_STENCIL_BIT &
> +                            cmd_state->attachments[ds].
> pending_clear_aspects;
> +         hzp.DepthBufferClearEnable = VK_IMAGE_ASPECT_DEPTH_BIT &
> +                            cmd_state->attachments[ds].
> pending_clear_aspects;
> +         hzp.FullSurfaceDepthandStencilClear = full_surface_op;
> +         hzp.StencilClearValue = 0xff &
> +                   cmd_state->attachments[ds].clear_value.depthStencil.
> stencil;
> +
> +         /* Mark aspects as cleared */
> +         cmd_state->attachments[ds].pending_clear_aspects = 0;
> +         break;
> +      case ANV_HZ_OP_DEPTH_RESOLVE:
> +         hzp.DepthBufferResolveEnable = true;
> +         break;
> +      case ANV_HZ_OP_HIZ_RESOLVE:
> +         hzp.HierarchicalDepthBufferResolveEnable = true;
> +         break;
> +      }
> +
> +      /* The depth resolve rectangle must match the size of the previous
> clear
> +       * rectangle.
> +       *
> +       * The HiZ resolve rectangle is specified as needing to be the
> +       * size of the full RT and aligned to 8x4, these requirements are in
> +       * conflict if the RT extent is not a multiple of 8x4. Testing shows
> +       * that setting the rectangle to match the render area works just
> fine.
> +       *
> +       * In a manner similar to i965, we'd like to diverge from the PRMs
> here
> +       * to reduce the number of HiZ blocks written to.
>

Not aligning to 8x4 shouldn't affect the number of HiZ blocks written to.
Maybe I'm confused :/


> +       */
> +      hzp.ClearRectangleXMin = anv_minify(cmd_state->render_
> area.offset.x,
> +                                          iview->base_mip);
> +      hzp.ClearRectangleYMin = anv_minify(cmd_state->render_
> area.offset.y,
> +                                          iview->base_mip);
> +      hzp.ClearRectangleXMax = anv_minify(cmd_state->render_area.offset.x
> +
> +                                          cmd_state->render_area.extent.
> width,
> +                                          iview->base_mip);
> +      hzp.ClearRectangleYMax = anv_minify(cmd_state->render_area.offset.y
> +
> +                                          cmd_state->render_area.extent.
> height,
> +                                          iview->base_mip);
>

This isn't correct.  The size of the render area should already take into
account the minification.  When you render into a non-zero LOD of an image,
the size of your framebuffer is the size of that LOD and the render area
must be a subset of that.


> +
> +      /* Due to a hardware issue, this bit MBZ */
> +      hzp.ScissorRectangleEnable = false;
> +      hzp.NumberofMultisamples = ffs(iview->image->samples) - 1;
> +      hzp.SampleMask = 0xFFFF;
> +   }
> +
> +   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
> +      pc.PostSyncOperation = WriteImmediateData;
> +      pc.Address =
> +         (struct anv_address){ &cmd_buffer->device->workaround_bo, 0 };
> +   }
> +
> +   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_WM_HZ_OP), hzp);
> +
> +   /* TODO: Determine if a DepthFlush and DepthStall is really necessary
> for
> +    *       non-full_surface_op clears.
> +    */
> +}
> +
>  void genX(CmdSetEvent)(
>      VkCommandBuffer                             commandBuffer,
>      VkEvent                                     _event,
> --
> 2.9.3
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20160902/a46105b6/attachment-0001.html>


More information about the mesa-dev mailing list