<div dir="ltr"><div class="gmail_extra"><div class="gmail_quote">On Wed, Jul 19, 2017 at 2:22 PM, Nanley Chery <span dir="ltr"><<a href="mailto:nanleychery@gmail.com" target="_blank">nanleychery@gmail.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><span class="">Image layouts only let us know that an image *may* be fast-cleared. For<br>
this reason we can end up with redundant resolves. Testing has shown<br>
that such resolves can measurably hurt performance and that predicating<br>
them can avoid the penalty.<br>
<br>
</span>v2:<br>
- Introduce additional resolve state management function (Jason Ekstrand).<br>
- Enable easy retrieval of fast clear state fields.<br>
<span class=""><br>
Signed-off-by: Nanley Chery <<a href="mailto:nanley.g.chery@intel.com">nanley.g.chery@intel.com</a>><br>
---<br>
 src/intel/vulkan/anv_blorp.c       |   3 +-<br>
</span> src/intel/vulkan/anv_private.h     |  13 ++--<br>
 src/intel/vulkan/genX_cmd_<wbr>buffer.c | 120 ++++++++++++++++++++++++++++++<wbr>+++----<br>
 3 files changed, 120 insertions(+), 16 deletions(-)<br>
<br>
diff --git a/src/intel/vulkan/anv_blorp.c b/src/intel/vulkan/anv_blorp.c<br>
index e9b2ccbbdf..ba34cec0bd 100644<br>
--- a/src/intel/vulkan/anv_blorp.c<br>
+++ b/src/intel/vulkan/anv_blorp.c<br>
@@ -1628,7 +1628,8 @@ anv_ccs_resolve(struct anv_cmd_buffer * const cmd_buffer,<br>
<span class="">       return;<br>
<br>
    struct blorp_batch batch;<br>
-   blorp_batch_init(&cmd_buffer-><wbr>device->blorp, &batch, cmd_buffer, 0);<br>
+   blorp_batch_init(&cmd_buffer-><wbr>device->blorp, &batch, cmd_buffer,<br>
+                    BLORP_BATCH_PREDICATE_ENABLE);<br>
<br>
    struct blorp_surf surf;<br>
    get_blorp_surf_for_anv_image(<wbr>image, VK_IMAGE_ASPECT_COLOR_BIT,<br>
diff --git a/src/intel/vulkan/anv_<wbr>private.h b/src/intel/vulkan/anv_<wbr>private.h<br>
</span>index e7b47ead36..588bf732df 100644<br>
--- a/src/intel/vulkan/anv_<wbr>private.h<br>
+++ b/src/intel/vulkan/anv_<wbr>private.h<br>
@@ -2090,11 +2090,16 @@ anv_fast_clear_state_entry_<wbr>size(const struct anv_device *device)<br>
<span class=""> {<br>
    assert(device);<br>
    /* Entry contents:<br>
-    *   +----------------------+<br>
-    *   | clear value dword(s) |<br>
-    *   +----------------------+<br>
+    *   +-----------------------------<wbr>---------------+<br>
+    *   | clear value dword(s) | needs resolve dword |<br>
+    *   +-----------------------------<wbr>---------------+<br>
     */<br>
-   return device->isl_dev.ss.clear_<wbr>value_size;<br>
+<br>
+   /* Ensure that the needs resolve dword is in fact dword-aligned to enable<br>
+    * GPU memcpy operations.<br>
+    */<br>
+   assert(device->isl_dev.ss.<wbr>clear_value_size % 4 == 0);<br>
+   return device->isl_dev.ss.clear_<wbr>value_size + 4;<br>
 }<br>
<br>
 /* Returns true if a HiZ-enabled depth buffer can be sampled from. */<br>
diff --git a/src/intel/vulkan/genX_cmd_<wbr>buffer.c b/src/intel/vulkan/genX_cmd_<wbr>buffer.c<br>
</span>index 611e77bddb..c4d67fe8c1 100644<br>
--- a/src/intel/vulkan/genX_cmd_<wbr>buffer.c<br>
+++ b/src/intel/vulkan/genX_cmd_<wbr>buffer.c<br>
@@ -407,21 +407,92 @@ transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,<br>
       anv_gen8_hiz_op_resolve(cmd_<wbr>buffer, image, hiz_op);<br>
 }<br>
<br>
+enum fast_clear_state_field {<br>
+   FAST_CLEAR_STATE_FIELD_CLEAR,<br>
+   FAST_CLEAR_STATE_FIELD_<wbr>RESOLVE,<br></blockquote><div><br></div><div>Mind calling these CLEAR_COLOR and NEEDS_RESOLVE?  Otherwise, it's not clear what you're fetching.<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+};<br>
+<br>
 static inline uint32_t<br>
-get_fast_clear_state_entry_<wbr>offset(const struct anv_device *device,<br>
-                                  const struct anv_image *image,<br>
-                                  unsigned level)<br>
+get_fast_clear_state_offset(<wbr>const struct anv_device *device,<br>
<span class="">+                            const struct anv_image *image,<br>
</span>+                            unsigned level, enum fast_clear_state_field field)<br>
 {<br>
    assert(device && image);<br>
<span class="">    assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT);<br>
    assert(level < anv_image_aux_levels(image));<br>
</span>-   const uint32_t offset = image->offset + image->aux_surface.offset +<br>
-                           image->aux_surface.isl.size +<br>
-                           anv_fast_clear_state_entry_<wbr>size(device) * level;<br>
+   uint32_t offset = image->offset + image->aux_surface.offset +<br>
+                     image->aux_surface.isl.size +<br>
+                     anv_fast_clear_state_entry_<wbr>size(device) * level;<br>
+<br>
+   switch (field) {<br>
+   case FAST_CLEAR_STATE_FIELD_<wbr>RESOLVE:<br>
+      offset += device->isl_dev.ss.clear_<wbr>value_size;<br>
+      /* Fall-through */<br>
+   case FAST_CLEAR_STATE_FIELD_CLEAR:<br>
+      break;<br>
+   }<br>
+<br>
    assert(offset < image->offset + image->size);<br>
<span class="">    return offset;<br>
 }<br>
<br>
+#define MI_PREDICATE_SRC0  0x2400<br>
+#define MI_PREDICATE_SRC1  0x2408<br>
+<br>
</span><span class="">+/* Manages the state of an color image subresource to ensure resolves are<br>
+ * performed properly.<br>
+ */<br>
+static void<br>
</span>+genX(set_image_needs_resolve)<wbr>(struct anv_cmd_buffer *cmd_buffer,<br>
<span class="">+                        const struct anv_image *image,<br>
</span>+                        unsigned level, bool needs_resolve)<br>
<span class="">+{<br>
+   assert(cmd_buffer && image);<br>
+   assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT);<br>
+   assert(level < anv_image_aux_levels(image));<br>
+<br>
+   const uint32_t resolve_flag_offset =<br>
</span>+      get_fast_clear_state_offset(<wbr>cmd_buffer->device, image, level,<br>
+                                  FAST_CLEAR_STATE_FIELD_<wbr>RESOLVE);<br>
+<br>
<span class="">+   /* The HW docs say that there is no way to guarantee the completion of<br>
+    * the following command. We use it nevertheless because it shows no<br>
+    * issues in testing is currently being used in the GL driver.<br>
+    */<br>
+   anv_batch_emit(&cmd_buffer-><wbr>batch, GENX(MI_STORE_DATA_IMM), sdi) {<br>
+      sdi.Address = (struct anv_address) { image->bo, resolve_flag_offset };<br>
</span>+      sdi.ImmediateData = needs_resolve;<br>
+   }<br>
+}<br>
+<br>
+static void<br>
+genX(load_needs_resolve_<wbr>predicate)(struct anv_cmd_buffer *cmd_buffer,<br>
<span class="">+                                   const struct anv_image *image,<br>
</span>+                                   unsigned level)<br>
<span class="">+{<br>
+   assert(cmd_buffer && image);<br>
+   assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT);<br>
+   assert(level < anv_image_aux_levels(image));<br>
+<br>
+   const uint32_t resolve_flag_offset =<br>
</span>+      get_fast_clear_state_offset(<wbr>cmd_buffer->device, image, level,<br>
+                                  FAST_CLEAR_STATE_FIELD_<wbr>RESOLVE);<br>
+<br>
<span class="">+   /* Make the pending predicated resolve a no-op if one is not needed.<br>
+    * predicate = do_resolve = resolve_flag != 0;<br>
+    */<br>
+   emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1    , 0);<br>
+   emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1 + 4, 0);<br>
+   emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC0    , 0);<br>
+   emit_lrm(&cmd_buffer->batch, MI_PREDICATE_SRC0 + 4,<br>
+            image->bo, resolve_flag_offset);<br>
+   anv_batch_emit(&cmd_buffer-><wbr>batch, GENX(MI_PREDICATE), mip) {<br>
+      mip.LoadOperation    = LOAD_LOADINV;<br>
+      mip.CombineOperation = COMBINE_SET;<br>
+      mip.CompareOperation = COMPARE_SRCS_EQUAL;<br>
+   }<br>
+}<br></span></blockquote><div><br></div><div>Thanks for splitting these up.  Much easier to read!<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><span class="">
</span><span class="">+<br>
 static void<br>
 init_fast_clear_state_entry(<wbr>struct anv_cmd_buffer *cmd_buffer,<br>
                             const struct anv_image *image,<br>
</span>@@ -431,6 +502,15 @@ init_fast_clear_state_entry(<wbr>struct anv_cmd_buffer *cmd_buffer,<br>
<span class="">    assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT);<br>
    assert(level < anv_image_aux_levels(image));<br>
<br>
+   /* The resolve flag should updated to signify that fast-clear/compression<br>
+    * data needs to be removed when leaving the undefined layout. Such data<br>
+    * may need to be removed if it would cause accesses to the color buffer<br>
+    * to return incorrect data. The fast clear data in CCS_D buffers should<br>
+    * be removed because CCS_D isn't enabled all the time.<br>
+    */<br>
</span>+   genX(set_image_needs_resolve)(<wbr>cmd_buffer, image, level,<br>
+                                 image->aux_usage == ISL_AUX_USAGE_NONE);<br>
<span class="">+<br>
    /* The fast clear value dword(s) will be copied into a surface state object.<br>
     * Ensure that the restrictions of the fields in the dword(s) are followed.<br>
     *<br>
</span>@@ -446,7 +526,8 @@ init_fast_clear_state_entry(<wbr>struct anv_cmd_buffer *cmd_buffer,<br>
    for (; i < cmd_buffer->device->isl_dev.<wbr>ss.clear_value_size; i += 4) {<br>
       anv_batch_emit(&cmd_buffer-><wbr>batch, GENX(MI_STORE_DATA_IMM), sdi) {<br>
          const uint32_t entry_offset =<br>
-            get_fast_clear_state_entry_<wbr>offset(cmd_buffer->device, image, level);<br>
+            get_fast_clear_state_offset(<wbr>cmd_buffer->device, image, level,<br>
+                                        FAST_CLEAR_STATE_FIELD_CLEAR);<br>
          sdi.Address = (struct anv_address) { image->bo, entry_offset + i };<br>
<br>
          if (GEN_GEN >= 9) {<br>
@@ -493,7 +574,8 @@ genX(copy_fast_clear_dwords)(<wbr>struct anv_cmd_buffer *cmd_buffer,<br>
    uint32_t ss_clear_offset = surface_state.offset +<br>
       cmd_buffer->device->isl_dev.<wbr>ss.clear_value_offset;<br>
    uint32_t entry_offset =<br>
-      get_fast_clear_state_entry_<wbr>offset(cmd_buffer->device, image, level);<br>
+      get_fast_clear_state_offset(<wbr>cmd_buffer->device, image, level,<br>
+                                  FAST_CLEAR_STATE_FIELD_CLEAR);<br>
    unsigned copy_size = cmd_buffer->device->isl_dev.<wbr>ss.clear_value_size;<br>
<br>
    if (copy_from_surface_state) {<br>
@@ -670,6 +752,8 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,<br>
<span class="">          layer_count = MIN2(layer_count, anv_image_aux_layers(image, level));<br>
       }<br>
<br>
</span>+      genX(load_needs_resolve_<wbr>predicate)(cmd_buffer, image, level);<br>
<span class="">+<br>
       /* Create a surface state with the right clear color and perform the<br>
        * resolve.<br>
        */<br>
</span>@@ -701,6 +785,8 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,<br>
<span class="">                       image->aux_usage == ISL_AUX_USAGE_CCS_E ?<br>
                       BLORP_FAST_CLEAR_OP_RESOLVE_<wbr>PARTIAL :<br>
                       BLORP_FAST_CLEAR_OP_RESOLVE_<wbr>FULL);<br>
+<br>
</span>+      genX(set_image_needs_resolve)(<wbr>cmd_buffer, image, level, false);<br>
    }<br>
<br>
    cmd_buffer->state.pending_<wbr>pipe_bits |=<br>
@@ -2450,9 +2536,6 @@ void genX(CmdDispatch)(<br>
<span class=""> #define GPGPU_DISPATCHDIMY 0x2504<br>
 #define GPGPU_DISPATCHDIMZ 0x2508<br>
<br>
-#define MI_PREDICATE_SRC0  0x2400<br>
-#define MI_PREDICATE_SRC1  0x2408<br>
-<br>
 void genX(CmdDispatchIndirect)(<br>
     VkCommandBuffer                             commandBuffer,<br>
     VkBuffer                                    _buffer,<br>
</span>@@ -2859,6 +2942,21 @@ cmd_buffer_subpass_sync_fast_<wbr>clear_values(struct anv_cmd_buffer *cmd_buffer)<br>
<span class="">          genX(copy_fast_clear_dwords)(<wbr>cmd_buffer, att_state->color_rt_state,<br>
                                       iview->image, iview->isl.base_level,<br>
                                       true /* copy from ss */);<br>
+<br>
+         /* Fast-clears impact whether or not a resolve will be necessary. */<br>
+         if (iview->image->aux_usage == ISL_AUX_USAGE_CCS_E &&<br>
+             att_state->clear_color_is_<wbr>zero) {<br>
+            /* This image always has the auxiliary buffer enabled. We can mark<br>
+             * the subresource as not needing a resolve because the clear color<br>
+             * will match what's in every RENDER_SURFACE_STATE object when it's<br>
+             * being used for sampling.<br>
+             */<br>
</span>+            genX(set_image_needs_resolve)(<wbr>cmd_buffer, iview->image,<br>
+                                          iview->isl.base_level, false);<br>
+         } else {<br>
+            genX(set_image_needs_resolve)(<wbr>cmd_buffer, iview->image,<br>
+                                          iview->isl.base_level, true);<br>
<span class="">+         }<br>
       } else if (rp_att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD) {<br>
          /* The attachment may have been fast-cleared in a previous render<br>
           * pass and the value is needed now. Update the surface state(s).<br>
--<br>
</span>2.13.3<br>
<div class="HOEnZb"><div class="h5"><br>
______________________________<wbr>_________________<br>
mesa-dev mailing list<br>
<a href="mailto:mesa-dev@lists.freedesktop.org">mesa-dev@lists.freedesktop.org</a><br>
<a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev" rel="noreferrer" target="_blank">https://lists.freedesktop.org/<wbr>mailman/listinfo/mesa-dev</a><br>
</div></div></blockquote></div><br></div></div>