[Mesa-dev] [PATCH 8/9] intel/blorp: Handle fast-clear directly in blorp_ccs_op

Tue Feb 20 23:15:26 UTC 2018

---
 src/intel/blorp/blorp_clear.c | 199 +++++++++++++++++++-----------------------
 1 file changed, 88 insertions(+), 111 deletions(-)

diff --git a/src/intel/blorp/blorp_clear.c b/src/intel/blorp/blorp_clear.c
index 17d47a1..8d729a2 100644
--- a/src/intel/blorp/blorp_clear.c
+++ b/src/intel/blorp/blorp_clear.c
@@ -193,104 +193,7 @@ get_fast_clear_rect(const struct isl_device *dev,
 
    /* Only single sampled surfaces need to (and actually can) be resolved. */
    if (aux_surf->usage == ISL_SURF_USAGE_CCS_BIT) {
-      /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render
-       * Target(s)", beneath the "Fast Color Clear" bullet (p327):
-       *
-       *     Clear pass must have a clear rectangle that must follow
-       *     alignment rules in terms of pixels and lines as shown in the
-       *     table below. Further, the clear-rectangle height and width
-       *     must be multiple of the following dimensions. If the height
-       *     and width of the render target being cleared do not meet these
-       *     requirements, an MCS buffer can be created such that it
-       *     follows the requirement and covers the RT.
-       *
-       * The alignment size in the table that follows is related to the
-       * alignment size that is baked into the CCS surface format but with X
-       * alignment multiplied by 16 and Y alignment multiplied by 32.
-       */
-      x_align = isl_format_get_layout(aux_surf->format)->bw;
-      y_align = isl_format_get_layout(aux_surf->format)->bh;
-
-      x_align *= 16;
-
-      /* SKL+ line alignment requirement for Y-tiled are half those of the prior
-       * generations.
-       */
-      if (dev->info->gen >= 9)
-         y_align *= 16;
-      else
-         y_align *= 32;
-
-      /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render
-       * Target(s)", beneath the "Fast Color Clear" bullet (p327):
-       *
-       *     In order to optimize the performance MCS buffer (when bound to
-       *     1X RT) clear similarly to MCS buffer clear for MSRT case,
-       *     clear rect is required to be scaled by the following factors
-       *     in the horizontal and vertical directions:
-       *
-       * The X and Y scale down factors in the table that follows are each
-       * equal to half the alignment value computed above.
-       */
-      x_scaledown = x_align / 2;
-      y_scaledown = y_align / 2;
-
-      if (ISL_DEV_IS_HASWELL(dev)) {
-         /* The following text was added in the Haswell PRM, "3D Media GPGPU
-          * Engine" >> "MCS Buffer for Render Target(s)" >> Table "Color Clear
-          * of Non-MultiSampler Render Target Restrictions":
-          *
-          *    "Clear rectangle must be aligned to two times the number of
-          *    pixels in the table shown below due to 16X16 hashing across the
-          *    slice."
-          *
-          * It has persisted in the documentation for all platforms up until
-          * Cannonlake and possibly even beyond.  However, we believe that it
-          * is only needed on Haswell.
-          *
-          * There are a couple possible explanations for this restriction:
-          *
-          * 1) If you assume that the hardware is writing to the CCS as
-          *    bytes, then the x/y_align computed above gives you an alignment
-          *    in the CCS of 8x8 bytes and, if 16x16 is needed for hashing, we
-          *    need to multiply by 2.
-          *
-          * 2) Haswell is a bit unique in that it's CCS tiling does not line
-          *    up with Y-tiling on a cache-line granularity.  Instead, it has
-          *    an extra bit of swizzling in bit 9.  Also, bit 6 swizzling
-          *    applies to the CCS on Haswell.  This means that Haswell CTS
-          *    does not match on a cache-line granularity but it does match on
-          *    a 2x2 cache line granularity.
-          *
-          * Clearly, the first explanation seems to follow documentation the
-          * best but they may be related.  In any case, empirical evidence
-          * seems to confirm that it is, indeed required on Haswell.
-          *
-          * On Broadwell things get a bit stickier.  Broadwell adds support
-          * for mip-mapped CCS with an alignment in the CCS of 256x128.  For a
-          * 32bpb main surface, the above computation will yield a x/y_align
-          * of 128x128 for a Y-tiled main surface and 256x64 for X-tiled.  In
-          * either case, if we double the alignment, we will get an alignment
-          * bigger than horizontal and vertical alignment of the CCS and fast
-          * clears of one LOD may leak into others.
-          *
-          * Starting with Skylake, the image alignment for the CCS is only
-          * 128x64 which is exactly the x/h_align computed above if the main
-          * surface has a 32bpb format.  Also, the "Render Target Resolve"
-          * page in the bspec (not the PRM) says, "The Resolve Rectangle size
-          * is same as Clear Rectangle size from SKL+".  The x/y_align
-          * computed above (without doubling) match the resolve rectangle
-          * calculation perfectly.
-          *
-          * Finally, to confirm all this, a full test run was performed on
-          * Feb. 9, 2018 with this doubling removed and the only platform
-          * which seemed to be affected was Haswell.  The run consisted of
-          * piglit, dEQP, the Vulkan CTS 1.0.2, the OpenGL 4.5 CTS, and the
-          * OpenGL ES 3.2 CTS.
-          */
-         x_align *= 2;
-         y_align *= 2;
-      }
+      unreachable("This function only supports MCS fast-clear");
    } else {
       assert(aux_surf->usage == ISL_SURF_USAGE_MCS_BIT);
 
@@ -826,14 +729,8 @@ blorp_ccs_op(struct blorp_batch *batch,
              enum isl_format format,
              enum isl_aux_op ccs_op)
 {
-   if (ccs_op == ISL_AUX_OP_FAST_CLEAR) {
-      blorp_fast_clear(batch, surf, format, level, start_layer, num_layers,
-                       0, 0,
-                       minify(surf->surf->logical_level0_px.w, level),
-                       minify(surf->surf->logical_level0_px.h, level));
-      return;
-   } else if (ISL_DEV_GEN(batch->blorp->isl_dev) < 10 &&
-              ccs_op == ISL_AUX_OP_AMBIGUATE) {
+   if (ISL_DEV_GEN(batch->blorp->isl_dev) < 10 &&
+       ccs_op == ISL_AUX_OP_AMBIGUATE) {
       /* Prior to Cannonlake, the ambiguate is not available as a hardware
        * operation.  Instead, we have to fake it by carefully binding the CCS
        * as a render target and clearing it to 0.  We leave that complicated
@@ -863,6 +760,11 @@ blorp_ccs_op(struct blorp_batch *batch,
       isl_format_get_layout(params.dst.aux_surf.format);
    assert(aux_fmtl->txc == ISL_TXC_CCS);
 
+   /* The PRM Sections entitled "Fast Color Clear" and "Render Target Resolve"
+    * contain tables for the scale down factor for fast clear and resolve
+    * rectangles.  The values in those tables are easily computed from the
+    * CCS element block size.
+    */
    unsigned x_scaledown, y_scaledown;
    if (ISL_DEV_GEN(batch->blorp->isl_dev) >= 9) {
       x_scaledown = aux_fmtl->bw * 8;
@@ -871,16 +773,91 @@ blorp_ccs_op(struct blorp_batch *batch,
       x_scaledown = aux_fmtl->bw * 8;
       y_scaledown = aux_fmtl->bh * 16;
    } else {
-      x_scaledown = aux_fmtl->bw / 2;
-      y_scaledown = aux_fmtl->bh / 2;
+      assert(ISL_DEV_GEN(batch->blorp->isl_dev) == 7);
+      if (ccs_op == ISL_AUX_OP_FAST_CLEAR) {
+         x_scaledown = aux_fmtl->bw * 8;
+         y_scaledown = aux_fmtl->bh * 16;
+      } else {
+         x_scaledown = aux_fmtl->bw / 2;
+         y_scaledown = aux_fmtl->bh / 2;
+      }
    }
+
+   unsigned x_align = x_scaledown;
+   unsigned y_align = y_scaledown;
+   if (ccs_op == ISL_AUX_OP_FAST_CLEAR) {
+      /* The PRM Section entitled "Fast Color Clear" contains tables for the
+       * scaledown and alignment factors for fast clear operations.  In all
+       * cases, the alignment table is 2x the scaledown table.
+       */
+      x_align *= 2;
+      y_align *= 2;
+
+      if (ISL_DEV_IS_HASWELL(batch->blorp->isl_dev)) {
+         /* The following text was added in the Haswell PRM, "3D Media GPGPU
+          * Engine" >> "MCS Buffer for Render Target(s)" >> Table "Color Clear
+          * of Non-MultiSampler Render Target Restrictions":
+          *
+          *    "Clear rectangle must be aligned to two times the number of
+          *    pixels in the table shown below due to 16X16 hashing across the
+          *    slice."
+          *
+          * It has persisted in the documentation for all platforms up until
+          * Cannonlake and possibly even beyond.  However, we believe that it
+          * is only needed on Haswell.
+          *
+          * There are a couple possible explanations for this restriction:
+          *
+          * 1) If you assume that the hardware is writing to the CCS as
+          *    bytes, then the x/y_align computed above gives you an alignment
+          *    in the CCS of 8x8 bytes and, if 16x16 is needed for hashing, we
+          *    need to multiply by 2.
+          *
+          * 2) Haswell is a bit unique in that it's CCS tiling does not line
+          *    up with Y-tiling on a cache-line granularity.  Instead, it has
+          *    an extra bit of swizzling in bit 9.  Also, bit 6 swizzling
+          *    applies to the CCS on Haswell.  This means that Haswell CTS
+          *    does not match on a cache-line granularity but it does match on
+          *    a 2x2 cache line granularity.
+          *
+          * Clearly, the first explanation seems to follow documentation the
+          * best but they may be related.  In any case, empirical evidence
+          * seems to confirm that it is, indeed required on Haswell.
+          *
+          * On Broadwell things get a bit stickier.  Broadwell adds support
+          * for mip-mapped CCS with an alignment in the CCS of 256x128.  For a
+          * 32bpb main surface, the above computation will yield a x/y_align
+          * of 128x128 for a Y-tiled main surface and 256x64 for X-tiled.  In
+          * either case, if we double the alignment, we will get an alignment
+          * bigger than horizontal and vertical alignment of the CCS and fast
+          * clears of one LOD may leak into others.
+          *
+          * Starting with Skylake, the image alignment for the CCS is only
+          * 128x64 which is exactly the x/h_align computed above if the main
+          * surface has a 32bpb format.  Also, the "Render Target Resolve"
+          * page in the bspec (not the PRM) says, "The Resolve Rectangle size
+          * is same as Clear Rectangle size from SKL+".  The x/y_align
+          * computed above (without doubling) match the resolve rectangle
+          * calculation perfectly.
+          *
+          * Finally, to confirm all this, a full test run was performed on
+          * Feb. 9, 2018 with this doubling removed and the only platform
+          * which seemed to be affected was Haswell.  The run consisted of
+          * piglit, dEQP, the Vulkan CTS 1.0.2, the OpenGL 4.5 CTS, and the
+          * OpenGL ES 3.2 CTS.
+          */
+         x_align *= 2;
+         y_align *= 2;
+      }
+   }
+
    params.x0 = params.y0 = 0;
    params.x1 = minify(params.dst.aux_surf.logical_level0_px.width, level);
    params.y1 = minify(params.dst.aux_surf.logical_level0_px.height, level);
-   params.x1 = ALIGN(params.x1, x_scaledown) / x_scaledown;
-   params.y1 = ALIGN(params.y1, y_scaledown) / y_scaledown;
+   params.x1 = ALIGN(params.x1, x_align) / x_scaledown;
+   params.y1 = ALIGN(params.y1, y_align) / y_scaledown;
 
-   assert(ccs_op != ISL_AUX_OP_FAST_CLEAR);
+   memset(&params.wm_inputs.clear_color, 0xff, 4*sizeof(float));
    params.fast_clear_op = ccs_op;
    params.num_layers = num_layers;
 
-- 
2.5.0.400.gff86faf