[Mesa-dev] [PATCH 1/2] i965/blorp: Add bilinear filtering of samples for multisample scaled blits

Tue Jun 4 18:20:41 PDT 2013

Current implementation of ext_framebuffer_multisample_blit_scaled in
i965/blorp uses nearest filtering for multisample scaled blits. Using
nearest filtering produces blocky artifacts and negates the benefits
of MSAA. That is the reason why extension was not enabled on i965.

This patch implements the bilinear filtering of samples in blorp
engine. Images generated with this patch are free from blocky
artifacts and show big improvement in visual quality.

Observed no piglit and gles3 regressions.
---
 src/mesa/drivers/dri/i965/brw_blorp.h        |   3 +
 src/mesa/drivers/dri/i965/brw_blorp_blit.cpp | 157 ++++++++++++++++++++++++++-
 2 files changed, 155 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_blorp.h b/src/mesa/drivers/dri/i965/brw_blorp.h
index 51b23db..a40324b 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp.h
+++ b/src/mesa/drivers/dri/i965/brw_blorp.h
@@ -305,6 +305,9 @@ struct brw_blorp_blit_prog_key
     * than one sample per pixel.
     */
    bool persample_msaa_dispatch;
+
+   /* True for scaled blitting. */
+   bool blit_scaled;
 };
 
 class brw_blorp_blit_params : public brw_blorp_params
diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
index a6b2bbf..46aa6d4 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
@@ -619,7 +619,8 @@ private:
    void kill_if_outside_dst_rect();
    void translate_dst_to_src();
    void single_to_blend();
-   void manual_blend(unsigned num_samples);
+   void manual_blend_average(unsigned num_samples);
+   void manual_blend_linear(unsigned num_samples);
    void sample(struct brw_reg dst);
    void texel_fetch(struct brw_reg dst);
    void mcs_fetch();
@@ -630,7 +631,7 @@ private:
    /**
     * Base-2 logarithm of the maximum number of samples that can be blended.
     */
-   static const unsigned LOG2_MAX_BLEND_SAMPLES = 3;
+   static const unsigned LOG2_MAX_BLEND_SAMPLES = 7;
 
    void *mem_ctx;
    struct brw_context *brw;
@@ -673,6 +674,9 @@ private:
     */
    struct brw_reg y_coords[2];
 
+   struct brw_reg x_coord_fract;
+   struct brw_reg y_coord_fract;
+
    /* Which element of x_coords and y_coords is currently in use.
     */
    int xy_coord_index;
@@ -811,15 +815,17 @@ brw_blorp_blit_program::compile(struct brw_context *brw,
     * that we want to texture from.  Exception: if we are blending, then S is
     * irrelevant, because we are going to fetch all samples.
     */
-   if (key->blend) {
+   if (key->blend && !key->blit_scaled) {
       if (brw->intel.gen == 6) {
          /* Gen6 hardware an automatically blend using the SAMPLE message */
          single_to_blend();
          sample(texture_data[0]);
       } else {
          /* Gen7+ hardware doesn't automaticaly blend. */
-         manual_blend(key->src_samples);
+         manual_blend_average(key->src_samples);
       }
+   } else if(key->blend && key->blit_scaled) {
+      manual_blend_linear(key->src_samples);
    } else {
       /* We aren't blending, which means we just want to fetch a single sample
        * from the source surface.  The address that we want to fetch from is
@@ -910,6 +916,13 @@ brw_blorp_blit_program::alloc_regs()
          = retype(brw_vec8_grf(reg, 0), BRW_REGISTER_TYPE_UD);
       reg += 2;
    }
+   this->x_coord_fract
+      = vec8(retype(brw_vec8_grf(reg, 0), BRW_REGISTER_TYPE_F));
+   reg += 2;
+   this->y_coord_fract
+      = vec8(retype(brw_vec8_grf(reg, 0), BRW_REGISTER_TYPE_F));
+   reg += 2;
+
    this->xy_coord_index = 0;
    this->sample_index
       = retype(brw_vec8_grf(reg, 0), BRW_REGISTER_TYPE_UD);
@@ -1365,6 +1378,10 @@ brw_blorp_blit_program::translate_dst_to_src()
    brw_MUL(&func, Y_f, Yp_f, y_transform.multiplier);
    brw_ADD(&func, X_f, X_f, x_transform.offset);
    brw_ADD(&func, Y_f, Y_f, y_transform.offset);
+   if (key->blit_scaled && key->blend) {
+      brw_FRC(&func, x_coord_fract, X_f);
+      brw_FRC(&func, y_coord_fract, Y_f);
+   }
    /* Round the float coordinates down to nearest integer by moving to
     * UD registers.
     */
@@ -1415,7 +1432,7 @@ inline int count_trailing_one_bits(unsigned value)
 
 
 void
-brw_blorp_blit_program::manual_blend(unsigned num_samples)
+brw_blorp_blit_program::manual_blend_average(unsigned num_samples)
 {
    if (key->tex_layout == INTEL_MSAA_LAYOUT_CMS)
       mcs_fetch();
@@ -1520,6 +1537,131 @@ brw_blorp_blit_program::manual_blend(unsigned num_samples)
       brw_ENDIF(&func);
 }
 
+void
+brw_blorp_blit_program::manual_blend_linear(unsigned num_samples)
+{
+   if (key->tex_layout == INTEL_MSAA_LAYOUT_CMS)
+      mcs_fetch();
+
+   /* e.g. for 4x MSAA:
+    *   result = lrp(lrp(sample[0] + sample[1]) + lrp(sample[2] + sample[3]))
+    *
+    * We do this computation by performing the following operations:
+    *
+    * In case of 4x MSAA:
+    * - Compute samples 0 to 3
+    * - linearly interpolate samples 0 and 1 in X
+    * - linearly interpolate samples 2 and 3 in X
+    * - linearly interpolate the results of last two operations in Y
+    *
+    * In case of 8x MSAA, we take the average of two samples in each quarter
+    * of the pixel to generate 4 sample values. These 4 sample values are used
+    * in a way similar to 4x MSAA:
+    * - Compute samples 0 to 7
+    * - Compute avearge of samples 0 and 7 and leave the result in sample 0
+    * - Compute average of samples 1 and 3 and leave the result in sample 1
+    * - Compute average of samples 2 and 6 and leave the result in sample 2
+    * - Compute average of samples 4 and 5 and leave the result in sample 3
+    * - linearly interpolate samples 0 and 1 in X
+    * - linearly interpolate samples 2 and 3 in X
+    * - linearly interpolate the results of last two operations in Y
+    *
+    */
+   for (unsigned i = 0; i < num_samples; ++i) {
+      assert(i < ARRAY_SIZE(texture_data));
+      if (i == 0) {
+         s_is_zero = true;
+      } else {
+         s_is_zero = false;
+         brw_MOV(&func, vec16(S), brw_imm_ud(i));
+      }
+      texel_fetch(texture_data[i]);
+
+      if (i == 0 && key->tex_layout == INTEL_MSAA_LAYOUT_CMS) {
+         /* The Ivy Bridge PRM, Vol4 Part1 p27 (Multisample Control Surface)
+          * suggests an optimization:
+          *
+          *     "A simple optimization with probable large return in
+          *     performance is to compare the MCS value to zero (indicating
+          *     all samples are on sample slice 0), and sample only from
+          *     sample slice 0 using ld2dss if MCS is zero."
+          *
+          * Note that in the case where the MCS value is zero, sampling from
+          * sample slice 0 using ld2dss and sampling from sample 0 using
+          * ld2dms are equivalent (since all samples are on sample slice 0).
+          * Since we have already sampled from sample 0, all we need to do is
+          * skip the remaining fetches and averaging if MCS is zero.
+          */
+         brw_CMP(&func, vec16(brw_null_reg()), BRW_CONDITIONAL_NZ,
+                 mcs_data, brw_imm_ud(0));
+         brw_IF(&func, BRW_EXECUTE_16);
+      }
+   }
+
+#define SAMPLE(x, y) offset(texture_data[x], y)
+#define COMBINE(brw_op, dst, src0, src1)\
+   for (int k = 0; k < 4; ++k)\
+      brw_op(&func, dst, src0, src1);
+
+   if (num_samples == 8) {
+      COMBINE(brw_ADD, SAMPLE(0, 2*k),
+              vec8(SAMPLE(0, 2*k)),
+              vec8(SAMPLE(7, 2*k)));
+      COMBINE(brw_MUL, SAMPLE(0, 2*k),
+              vec8(SAMPLE(0, 2*k)),
+              brw_imm_f(0.5));
+      COMBINE(brw_ADD, SAMPLE(1, 2*k),
+              vec8(SAMPLE(1, 2*k)),
+              vec8(SAMPLE(3, 2*k)));
+      COMBINE(brw_MUL, SAMPLE(1, 2*k),
+              vec8(SAMPLE(1, 2*k)),
+              brw_imm_f(0.5));
+      COMBINE(brw_ADD, SAMPLE(2, 2*k),
+              vec8(SAMPLE(2, 2*k)),
+              vec8(SAMPLE(6, 2*k)));
+      COMBINE(brw_MUL, SAMPLE(2, 2*k),
+              vec8(SAMPLE(2, 2*k)),
+              brw_imm_f(0.5));
+      COMBINE(brw_ADD, SAMPLE(3, 2*k),
+              vec8(SAMPLE(4, 2*k)),
+              vec8(SAMPLE(5, 2*k)));
+      COMBINE(brw_MUL, SAMPLE(3, 2*k),
+              vec8(SAMPLE(3, 2*k)),
+              brw_imm_f(0.5));
+
+      /* Shuffle sample registers stored in texture_data array to match with
+       * the layout of samples in 4x MSAA.
+       */
+      t1 = texture_data[0];
+      t2 = texture_data[1];
+      texture_data[0] = texture_data[3];
+      texture_data[1] = texture_data[2];
+      texture_data[2] = t1;
+      texture_data[3] = t2;
+   }
+
+   brw_set_access_mode(&func, BRW_ALIGN_16);
+   for (int index = 3; index > 0; ) {
+      for (int k = 0; k < 8; ++k)
+         brw_LRP(&func,
+                 vec8(SAMPLE(index - 1, k)),
+                 offset(x_coord_fract, k & 1),
+                 SAMPLE(index, k),
+                 SAMPLE(index - 1, k));
+      index -= 2;
+   }
+   for (int k = 0; k < 8; ++k)
+      brw_LRP(&func,
+              vec8(SAMPLE(0, k)),
+              offset(y_coord_fract, k & 1),
+              vec8(SAMPLE(2, k)),
+              vec8(SAMPLE(0, k)));
+   brw_set_access_mode(&func, BRW_ALIGN_1);
+#undef SAMPLE
+   if (key->tex_layout == INTEL_MSAA_LAYOUT_CMS)
+      brw_ENDIF(&func);
+}
+
 /**
  * Emit code to look up a value in the texture using the SAMPLE message (which
  * does blending of MSAA surfaces).
@@ -1869,6 +2011,11 @@ brw_blorp_blit_params::brw_blorp_blit_params(struct brw_context *brw,
       wm_prog_key.persample_msaa_dispatch = true;
    }
 
+   /* Scaled blitting or not. */
+   wm_prog_key.blit_scaled =
+      ((dst_x1 - dst_x0) == (src_x1 - src_x0) &&
+       (dst_y1 - dst_y0) == (src_y1 - src_y0)) ? false : true;
+
    /* The render path must be configured to use the same number of samples as
     * the destination buffer.
     */
-- 
1.8.1.4