<div dir="ltr"><div class="gmail_extra"><div class="gmail_quote">On Thu, Apr 27, 2017 at 11:32 AM, Nanley Chery <span dir="ltr"><<a href="mailto:nanleychery@gmail.com" target="_blank">nanleychery@gmail.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">There's no image layout to represent a full-RT-cleared color attachment.<br>
That's one reason we can end up with redundant resolves. Testing has<br>
shown that such resolves can measurably hurt performance and that<br>
predicating them can avoid the penalty.<br>
<br>
Signed-off-by: Nanley Chery <<a href="mailto:nanley.g.chery@intel.com">nanley.g.chery@intel.com</a>><br>
---<br>
 src/intel/vulkan/anv_blorp.c       |   3 +-<br>
 src/intel/vulkan/anv_image.c       |   6 +++<br>
 src/intel/vulkan/genX_cmd_<wbr>buffer.c | 106 ++++++++++++++++++++++++++++++<wbr>++++---<br>
 3 files changed, 108 insertions(+), 7 deletions(-)<br>
<br>
diff --git a/src/intel/vulkan/anv_blorp.c b/src/intel/vulkan/anv_blorp.c<br>
index 821d01a077..32f0edf316 100644<br>
--- a/src/intel/vulkan/anv_blorp.c<br>
+++ b/src/intel/vulkan/anv_blorp.c<br>
@@ -1500,7 +1500,8 @@ anv_ccs_resolve(struct anv_cmd_buffer * const cmd_buffer,<br>
<br>
    struct blorp_batch batch;<br>
    blorp_batch_init(&cmd_buffer-><wbr>device->blorp, &batch, cmd_buffer,<br>
-                    BLORP_BATCH_NO_EMIT_DEPTH_<wbr>STENCIL);<br>
+                    BLORP_BATCH_NO_EMIT_DEPTH_<wbr>STENCIL |<br>
+                    BLORP_BATCH_PREDICATE_ENABLE);<br>
<br>
    struct blorp_surf surf;<br>
    get_blorp_surf_for_anv_image(<wbr>image, VK_IMAGE_ASPECT_COLOR_BIT,<br>
diff --git a/src/intel/vulkan/anv_image.c b/src/intel/vulkan/anv_image.c<br>
index 751f2d6026..92ee86dab5 100644<br>
--- a/src/intel/vulkan/anv_image.c<br>
+++ b/src/intel/vulkan/anv_image.c<br>
@@ -208,6 +208,12 @@ anv_fill_ccs_resolve_ss(const struct anv_device * const device,<br>
                        .aux_usage = image->aux_usage == ISL_AUX_USAGE_NONE ?<br>
                                     ISL_AUX_USAGE_CCS_D : image->aux_usage,<br>
                        .mocs = device->default_mocs);<br>
+<br>
+   /* The following dword is used as a flag to represent whether or not this<br>
+    * CCS subresource needs resolving. We want this to be zero by default,<br>
+    * which means that a resolve is not necessary.<br>
+    */<br>
+   assert(*(uint32_t *)(data + device->isl_dev.ss.addr_<wbr>offset) == 0);<br></blockquote><div><br></div><div>This seems like kind-of an odd place to put it... Not a problem, just odd.<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
 }<br>
<br>
 /**<br>
diff --git a/src/intel/vulkan/genX_cmd_<wbr>buffer.c b/src/intel/vulkan/genX_cmd_<wbr>buffer.c<br>
index 95729ec8a8..041301290e 100644<br>
--- a/src/intel/vulkan/genX_cmd_<wbr>buffer.c<br>
+++ b/src/intel/vulkan/genX_cmd_<wbr>buffer.c<br>
@@ -402,6 +402,82 @@ transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,<br>
       anv_gen8_hiz_op_resolve(cmd_<wbr>buffer, image, hiz_op);<br>
 }<br>
<br>
+static uint32_t<br>
+clear_value_buffer_offset(<wbr>const struct anv_cmd_buffer * const cmd_buffer,<br>
+                          const struct anv_image * const image,<br>
+                          const uint8_t level)<br>
+{<br>
+   return image->offset +<br>
+          image->aux_surface.offset + image->aux_surface.isl.size +<br>
+          cmd_buffer->device->isl_dev.<wbr>ss.size * level;<br>
+}<br>
+<br>
+#define MI_PREDICATE_SRC0  0x2400<br>
+#define MI_PREDICATE_SRC1  0x2408<br>
+<br>
+enum ccs_resolve_state {<br>
+   CCS_RESOLVE_NOT_NEEDED,<br>
+   CCS_RESOLVE_NEEDED,<br>
+   CCS_RESOLVE_STARTING,<br>
+};<br>
+<br>
+/* Manages the state of an color image subresource to ensure resolves are<br>
+ * performed properly.<br>
+ */<br>
+static void<br>
+genX(set_resolve_state)(<wbr>struct anv_cmd_buffer * const cmd_buffer,<br>
+                        const struct anv_image * const image,<br>
+                        const uint8_t level,<br>
+                        const enum ccs_resolve_state state)<br>
+{<br>
+   assert(cmd_buffer && image);<br>
+<br>
+   /* The image subresource range must have a color auxiliary buffer. */<br>
+   assert(anv_image_has_color_<wbr>aux(image));<br>
+   assert(level < anv_color_aux_levels(image));<br>
+<br>
+   /* We store the resolve flag in the location of the surface base address<br>
+    * field for simplicity and because it is initialized to zero when the<br>
+    * clear value buffer is initialized.<br>
+    */<br>
+   const uint32_t resolve_flag_offset =<br>
+      clear_value_buffer_offset(cmd_<wbr>buffer, image, level) +<br>
+      cmd_buffer->device->isl_dev.<wbr>ss.addr_offset;<br>
+<br>
+   /* An update operation should not overwrite the fast clear value. */<br>
+   if (cmd_buffer->device->isl_dev.<wbr>ss.addr_offset <<br>
+       cmd_buffer->device->isl_dev.<wbr>ss.clear_value_offset) {<br>
+      assert(cmd_buffer->device-><wbr>isl_dev.ss.addr_offset + 4 <=<br>
+             cmd_buffer->device->isl_dev.<wbr>ss.clear_value_offset);<br>
+   }<br>
+<br>
+   if (state != CCS_RESOLVE_STARTING) {<br>
+      assert(state == CCS_RESOLVE_NEEDED || state == CCS_RESOLVE_NOT_NEEDED);<br>
+      /* The HW docs say that there is no way to guarantee the completion of<br>
+       * the following command. We use it nevertheless because it shows no<br>
+       * issues in testing is currently being used in the GL driver.<br>
+       */<br>
+      anv_batch_emit(&cmd_buffer-><wbr>batch, GENX(MI_STORE_DATA_IMM), sdi) {<br>
+         sdi.Address = (struct anv_address) { image->bo, resolve_flag_offset };<br>
+         sdi.ImmediateData = state == CCS_RESOLVE_NEEDED;<br>
+      }<br>
+   } else {<br>
+      /* Make the pending predicated resolve a no-op if one is not needed.<br>
+       * predicate = do_resolve = resolve_flag != 0;<br>
+       */<br>
+      emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1    , 0);<br>
+      emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1 + 4, 0);<br>
+      emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC0    , 0);<br>
+      emit_lrm(&cmd_buffer->batch, MI_PREDICATE_SRC0 + 4,<br>
+               image->bo, resolve_flag_offset);<br>
+      anv_batch_emit(&cmd_buffer-><wbr>batch, GENX(MI_PREDICATE), mip) {<br>
+         mip.LoadOperation    = LOAD_LOADINV;<br>
+         mip.CombineOperation = COMBINE_SET;<br>
+         mip.CompareOperation = COMPARE_SRCS_EQUAL;<br>
+      }<br></blockquote><div><br></div><div>I really wish the MI_PREDICATE were closer to the anv_ccs_resolve.  Maybe we could just inline this function?  If we had a helper to get the anv_address of the resolve flag, it wouldn't be bad at all.<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+   }<br>
+}<br>
+<br>
<br>
 /* Copies clear value dwords between a surface state object and an image's<br>
  * clear value buffer.<br>
@@ -420,9 +496,7 @@ genX(transfer_clear_value)(<wbr>struct anv_cmd_buffer * const cmd_buffer,<br>
    assert(level < anv_color_aux_levels(image));<br>
<br>
    const uint32_t img_clear_offset =<br>
-      image->offset + image->aux_surface.offset +<br>
-      image->aux_surface.isl.size +<br>
-      cmd_buffer->device->isl_dev.<wbr>ss.size * level +<br>
+      clear_value_buffer_offset(cmd_<wbr>buffer, image, level) +<br>
       cmd_buffer->device->isl_dev.<wbr>ss.clear_value_offset;<br>
<br>
    struct anv_bo * const ss_bo =<br>
@@ -523,6 +597,8 @@ transition_color_buffer(struct anv_cmd_buffer * const cmd_buffer,<br>
<br>
    for (uint32_t level = base_level; level < base_level + level_count; level++) {<br>
<br>
+      genX(set_resolve_state)(cmd_<wbr>buffer, image, level, CCS_RESOLVE_STARTING);<br>
+<br>
       layer_count = MIN2(layer_count, anv_color_aux_layers(image, level));<br>
       for (uint32_t layer = base_layer; layer < base_layer + layer_count; layer++) {<br>
<br>
@@ -541,6 +617,8 @@ transition_color_buffer(struct anv_cmd_buffer * const cmd_buffer,<br>
                                     false);<br>
          anv_ccs_resolve(cmd_buffer, surface_state, image, level, layer, op);<br>
       }<br>
+<br>
+      genX(set_resolve_state)(cmd_<wbr>buffer, image, level, CCS_RESOLVE_NOT_NEEDED);<br>
    }<br>
<br>
    cmd_buffer->state.pending_<wbr>pipe_bits |=<br>
@@ -693,6 +771,25 @@ genX(cmd_buffer_setup_<wbr>attachments)(struct anv_cmd_buffer *cmd_buffer,<br>
                genX(transfer_clear_value)(<wbr>cmd_buffer,<br>
                   state->attachments[i].color_<wbr>rt_state, iview->image,<br>
                   iview->isl.base_level, true /* copy_to_buffer */);<br>
+<br>
+               /* If this image always has the auxiliary buffer enabled we can<br>
+                * mark the subresource as not needing a resolve if the clear<br>
+                * color will match what's in the RENDER_SURFACE_STATE object<br>
+                * when it's being used for sampling.<br>
+                */<br>
+               if (iview->image->aux_usage == ISL_AUX_USAGE_CCS_E &&<br>
+                   clear_color.u32[0] == 0 &&<br>
+                   clear_color.u32[1] == 0 &&<br>
+                   clear_color.u32[2] == 0 &&<br>
+                   clear_color.u32[3] == 0) {<br>
+                  genX(set_resolve_state)(cmd_<wbr>buffer, iview->image,<br>
+                                          iview->isl.base_level,<br>
+                                          CCS_RESOLVE_NOT_NEEDED);<br>
+               } else {<br>
+                  genX(set_resolve_state)(cmd_<wbr>buffer, iview->image,<br>
+                                          iview->isl.base_level,<br>
+                                          CCS_RESOLVE_NEEDED);<br>
+               }<br>
             } else if (att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD &&<br>
                        state->attachments[i].aux_<wbr>usage != ISL_AUX_USAGE_NONE) {<br>
                /* The attachment may have been fast-cleared in a previous<br>
@@ -2194,9 +2291,6 @@ void genX(CmdDispatch)(<br>
 #define GPGPU_DISPATCHDIMY 0x2504<br>
 #define GPGPU_DISPATCHDIMZ 0x2508<br>
<br>
-#define MI_PREDICATE_SRC0  0x2400<br>
-#define MI_PREDICATE_SRC1  0x2408<br>
-<br>
 void genX(CmdDispatchIndirect)(<br>
     VkCommandBuffer                             commandBuffer,<br>
     VkBuffer                                    _buffer,<br>
<span class="HOEnZb"><font color="#888888">--<br>
2.12.2<br>
<br>
______________________________<wbr>_________________<br>
mesa-dev mailing list<br>
<a href="mailto:mesa-dev@lists.freedesktop.org">mesa-dev@lists.freedesktop.org</a><br>
<a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev" rel="noreferrer" target="_blank">https://lists.freedesktop.org/<wbr>mailman/listinfo/mesa-dev</a><br>
</font></span></blockquote></div><br></div></div>