[Mesa-dev] [PATCH V2 3/4] intel: Add multisample scaled blitting in blorp engine

Wed May 29 10:06:49 PDT 2013

On Fri, May 24, 2013 at 12:01 PM, Paul Berry <stereotype441 at gmail.com> wrote:
> On 16 May 2013 11:44, Anuj Phogat <anuj.phogat at gmail.com> wrote:
>>
>> In traditional multisampled framebuffer rendering, color samples must be
>> explicitly resolved via BlitFramebuffer before doing the scaled blitting
>> of the framebuffer. So, scaled blitting of a multisample framebuffer
>> takes two separate calls to BlitFramebuffer.
>>
>> This patch implements the functionality of doing multisampled scaled
>> resolve using just one BlitFramebuffer call. Important changes involved
>> in this patch are listed below:
>>     - Use float registers to scale and offset texture coordinates.
>>     - Change offset computation to consider float coordinates.
>>     - Round the scaled coordinates down to nearest integer.
>>     - Modify src texture coordinates clipping to account for scaling..
>>     - Linear filter is not yet implemented in blorp. So, don't use
>>       blorp engine to do single sampled scaled blitting.
>>
>> Note: Observed no piglit regressions on sandybridge & ivybridge with
>> these changes.
>>
>> Signed-off-by: Anuj Phogat <anuj.phogat at gmail.com>
>> ---
>>  src/mesa/drivers/dri/i965/brw_blorp.h          |  23 ++--
>>  src/mesa/drivers/dri/i965/brw_blorp_blit.cpp   | 143
>> +++++++++++++++----------
>>  src/mesa/drivers/dri/i965/brw_reg.h            |   7 --
>>  src/mesa/drivers/dri/intel/intel_mipmap_tree.c |   2 +
>>  4 files changed, 102 insertions(+), 73 deletions(-)
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_blorp.h
>> b/src/mesa/drivers/dri/i965/brw_blorp.h
>> index 70e3933..a40324b 100644
>> --- a/src/mesa/drivers/dri/i965/brw_blorp.h
>> +++ b/src/mesa/drivers/dri/i965/brw_blorp.h
>> @@ -40,9 +40,10 @@ brw_blorp_blit_miptrees(struct intel_context *intel,
>>                          unsigned src_level, unsigned src_layer,
>>                          struct intel_mipmap_tree *dst_mt,
>>                          unsigned dst_level, unsigned dst_layer,
>> -                        int src_x0, int src_y0,
>> -                        int dst_x0, int dst_y0,
>> -                        int dst_x1, int dst_y1,
>> +                        float src_x0, float src_y0,
>> +                        float src_x1, float src_y1,
>> +                        float dst_x0, float dst_y0,
>> +                        float dst_x1, float dst_y1,
>>                          bool mirror_x, bool mirror_y);
>>
>>  bool
>> @@ -158,11 +159,11 @@ public:
>>
>>  struct brw_blorp_coord_transform_params
>>  {
>> -   void setup(GLuint src0, GLuint dst0, GLuint dst1,
>> +   void setup(GLfloat src0, GLfloat src1, GLfloat dst0, GLfloat dst1,
>>                bool mirror);
>>
>> -   int32_t multiplier;
>> -   int32_t offset;
>> +   float multiplier;
>> +   float offset;
>>  };
>>
>>
>> @@ -304,6 +305,9 @@ struct brw_blorp_blit_prog_key
>>      * than one sample per pixel.
>>      */
>>     bool persample_msaa_dispatch;
>> +
>> +   /* True for scaled blitting. */
>> +   bool blit_scaled;
>>  };
>>
>>  class brw_blorp_blit_params : public brw_blorp_params
>> @@ -314,9 +318,10 @@ public:
>>                           unsigned src_level, unsigned src_layer,
>>                           struct intel_mipmap_tree *dst_mt,
>>                           unsigned dst_level, unsigned dst_layer,
>> -                         GLuint src_x0, GLuint src_y0,
>> -                         GLuint dst_x0, GLuint dst_y0,
>> -                         GLuint width, GLuint height,
>> +                         GLfloat src_x0, GLfloat src_y0,
>> +                         GLfloat src_x1, GLfloat src_y1,
>> +                         GLfloat dst_x0, GLfloat dst_y0,
>> +                         GLfloat dst_x1, GLfloat dst_y1,
>>                           bool mirror_x, bool mirror_y);
>>
>>     virtual uint32_t get_wm_prog(struct brw_context *brw,
>> diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
>> b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
>> index b7ee92b..19169ef 100644
>> --- a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
>> @@ -41,11 +41,11 @@
>>   * If coord0 > coord1, swap them and invert the "mirror" boolean.
>>   */
>>  static inline void
>> -fixup_mirroring(bool &mirror, GLint &coord0, GLint &coord1)
>> +fixup_mirroring(bool &mirror, GLfloat &coord0, GLfloat &coord1)
>>  {
>>     if (coord0 > coord1) {
>>        mirror = !mirror;
>> -      GLint tmp = coord0;
>> +      GLfloat tmp = coord0;
>>        coord0 = coord1;
>>        coord1 = tmp;
>>     }
>> @@ -67,9 +67,10 @@ fixup_mirroring(bool &mirror, GLint &coord0, GLint
>> &coord1)
>>   * coordinates, by swapping the roles of src and dst.
>>   */
>>  static inline bool
>> -clip_or_scissor(bool mirror, GLint &src_x0, GLint &src_x1, GLint &dst_x0,
>> -                GLint &dst_x1, GLint fb_xmin, GLint fb_xmax)
>> +clip_or_scissor(bool mirror, GLfloat &src_x0, GLfloat &src_x1, GLfloat
>> &dst_x0,
>> +                GLfloat &dst_x1, GLfloat fb_xmin, GLfloat fb_xmax)
>>  {
>> +   float scale = (float) (src_x1 - src_x0) / (dst_x1 - dst_x0);
>>     /* If we are going to scissor everything away, stop. */
>>     if (!(fb_xmin < fb_xmax &&
>>           dst_x0 < fb_xmax &&
>> @@ -105,8 +106,8 @@ clip_or_scissor(bool mirror, GLint &src_x0, GLint
>> &src_x1, GLint &dst_x0,
>>     /* Adjust the source rectangle to remove the pixels corresponding to
>> those
>>      * that were clipped/scissored out of the destination rectangle.
>>      */
>> -   src_x0 += pixels_clipped_left;
>> -   src_x1 -= pixels_clipped_right;
>> +   src_x0 += pixels_clipped_left * scale;
>> +   src_x1 -= pixels_clipped_right * scale;
>>
>>     return true;
>>  }
>> @@ -127,9 +128,10 @@ brw_blorp_blit_miptrees(struct intel_context *intel,
>>                          unsigned src_level, unsigned src_layer,
>>                          struct intel_mipmap_tree *dst_mt,
>>                          unsigned dst_level, unsigned dst_layer,
>> -                        int src_x0, int src_y0,
>> -                        int dst_x0, int dst_y0,
>> -                        int dst_x1, int dst_y1,
>> +                        float src_x0, float src_y0,
>> +                        float src_x1, float src_y1,
>> +                        float dst_x0, float dst_y0,
>> +                        float dst_x1, float dst_y1,
>>                          bool mirror_x, bool mirror_y)
>>  {
>>     intel_miptree_slice_resolve_depth(intel, src_mt, src_level,
>> src_layer);
>> @@ -139,6 +141,7 @@ brw_blorp_blit_miptrees(struct intel_context *intel,
>>                                  src_mt, src_level, src_layer,
>>                                  dst_mt, dst_level, dst_layer,
>>                                  src_x0, src_y0,
>> +                                src_x1, src_y1,
>>                                  dst_x0, dst_y0,
>>                                  dst_x1, dst_y1,
>>                                  mirror_x, mirror_y);
>> @@ -151,8 +154,8 @@ static void
>>  do_blorp_blit(struct intel_context *intel, GLbitfield buffer_bit,
>>                struct intel_renderbuffer *src_irb,
>>                struct intel_renderbuffer *dst_irb,
>> -              GLint srcX0, GLint srcY0,
>> -              GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
>> +              GLfloat srcX0, GLfloat srcY0, GLfloat srcX1, GLfloat srcY1,
>> +              GLfloat dstX0, GLfloat dstY0, GLfloat dstX1, GLfloat dstY1,
>>                bool mirror_x, bool mirror_y)
>>  {
>>     /* Find source/dst miptrees */
>> @@ -163,7 +166,8 @@ do_blorp_blit(struct intel_context *intel, GLbitfield
>> buffer_bit,
>>     brw_blorp_blit_miptrees(intel,
>>                             src_mt, src_irb->mt_level, src_irb->mt_layer,
>>                             dst_mt, dst_irb->mt_level, dst_irb->mt_layer,
>> -                           srcX0, srcY0, dstX0, dstY0, dstX1, dstY1,
>> +                           srcX0, srcY0, srcX1, srcY1,
>> +                           dstX0, dstY0, dstX1, dstY1,
>>                             mirror_x, mirror_y);
>>
>>     intel_renderbuffer_set_needs_downsample(dst_irb);
>> @@ -203,8 +207,8 @@ formats_match(GLbitfield buffer_bit, struct
>> intel_renderbuffer *src_irb,
>>
>>  static bool
>>  try_blorp_blit(struct intel_context *intel,
>> -               GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
>> -               GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
>> +               GLfloat srcX0, GLfloat srcY0, GLfloat srcX1, GLfloat
>> srcY1,
>> +               GLfloat dstX0, GLfloat dstY0, GLfloat dstX1, GLfloat
>> dstY1,
>>                 GLenum filter, GLbitfield buffer_bit)
>>  {
>>     struct gl_context *ctx = &intel->ctx;
>> @@ -224,9 +228,13 @@ try_blorp_blit(struct intel_context *intel,
>>     fixup_mirroring(mirror_y, srcY0, srcY1);
>>     fixup_mirroring(mirror_y, dstY0, dstY1);
>>
>> -   /* Make sure width and height match */
>> -   if (srcX1 - srcX0 != dstX1 - dstX0) return false;
>> -   if (srcY1 - srcY0 != dstY1 - dstY0) return false;
>> +   /* Linear filtering is not yet implemented in blorp. So, do not use
>> blorp
>> +    * engine for single sampled scaled blits.
>> +    */
>> +   if ((srcX1 - srcX0 != dstX1 - dstX0 ||
>> +        srcY1 - srcY0 != dstY1 - dstY0) &&
>> +       read_fb->Visual.samples == 0)
>
>
> It's a little strange that your comment mentions linear filtering but the
> code doesn't.  How about this instead (I believe it's equivalent)?
>
> if ((srcX1 - srcX0 != dstX1 - dstX0 ||
>      srcY1 - srcY0 != dstY1 - dstY0) &&
>      filter == GL_LINEAR)
>
Yeah. This is what I intended to write initially. But making this change causes
a piglit subtest fail.
./bin/fbo-blit-stretch -auto
45x79 (13, 33)-(30, 44) => 200x150 (19, 23)-(87, 67) stretch_x stretch_y nearest
Probe at (55,23)
  Expected: 0.000000 1.000000 0.000000
  Observed: 1.000000 0.000000 0.000000

Test seems to fail at the edge pixel. Other sub tests with stretch_x stretch_y
nearest passes. This sub test looks too strict at the edges.
No failures are observed in gles3 CTS with above change.
I have three options now:
1. Hold the patch and investigate the failing piglit test case.
2. Push the patch without suggested change.
3. Relying on gles3 CTS results, push the patch with suggested change.
I'm little inclined towards 3. What do you think?

>>
>> +      return false;
>>
>>     /* If the destination rectangle needs to be clipped or scissored, do
>> so.
>>      */
>> @@ -278,7 +286,8 @@ try_blorp_blit(struct intel_context *intel,
>>           dst_irb =
>> intel_renderbuffer(ctx->DrawBuffer->_ColorDrawBuffers[i]);
>>          if (dst_irb)
>>              do_blorp_blit(intel, buffer_bit, src_irb, dst_irb, srcX0,
>> srcY0,
>> -                          dstX0, dstY0, dstX1, dstY1, mirror_x,
>> mirror_y);
>> +                          srcX1, srcY1, dstX0, dstY0, dstX1, dstY1,
>> +                          mirror_x, mirror_y);
>>        }
>>        break;
>>     case GL_DEPTH_BUFFER_BIT:
>> @@ -289,7 +298,8 @@ try_blorp_blit(struct intel_context *intel,
>>        if (!formats_match(buffer_bit, src_irb, dst_irb))
>>           return false;
>>        do_blorp_blit(intel, buffer_bit, src_irb, dst_irb, srcX0, srcY0,
>> -                    dstX0, dstY0, dstX1, dstY1, mirror_x, mirror_y);
>> +                    srcX1, srcY1, dstX0, dstY0, dstX1, dstY1,
>> +                    mirror_x, mirror_y);
>>        break;
>>     case GL_STENCIL_BUFFER_BIT:
>>        src_irb =
>> @@ -299,7 +309,8 @@ try_blorp_blit(struct intel_context *intel,
>>        if (!formats_match(buffer_bit, src_irb, dst_irb))
>>           return false;
>>        do_blorp_blit(intel, buffer_bit, src_irb, dst_irb, srcX0, srcY0,
>> -                    dstX0, dstY0, dstX1, dstY1, mirror_x, mirror_y);
>> +                    srcX1, srcY1, dstX0, dstY0, dstX1, dstY1,
>> +                    mirror_x, mirror_y);
>>        break;
>>     default:
>>        assert(false);
>> @@ -347,6 +358,7 @@ brw_blorp_copytexsubimage(struct intel_context *intel,
>>      */
>>
>>     int srcY1 = srcY0 + height;
>> +   int srcX1 = srcX0 + width;
>>     int dstX1 = dstX0 + width;
>>     int dstY1 = dstY0 + height;
>>
>> @@ -364,7 +376,8 @@ brw_blorp_copytexsubimage(struct intel_context *intel,
>>     brw_blorp_blit_miptrees(intel,
>>                             src_mt, src_irb->mt_level, src_irb->mt_layer,
>>                             dst_mt, dst_image->Level, dst_image->Face,
>> -                           srcX0, srcY0, dstX0, dstY0, dstX1, dstY1,
>> +                           srcX0, srcY0, srcX1, srcY1,
>> +                           dstX0, dstY0, dstX1, dstY1,
>>                             false, mirror_y);
>>
>>     /* If we're copying to a packed depth stencil texture and the source
>> @@ -380,7 +393,8 @@ brw_blorp_copytexsubimage(struct intel_context *intel,
>>        brw_blorp_blit_miptrees(intel,
>>                                src_irb->mt, src_irb->mt_level,
>> src_irb->mt_layer,
>>                                dst_mt, dst_image->Level, dst_image->Face,
>> -                              srcX0, srcY0, dstX0, dstY0, dstX1, dstY1,
>> +                              srcX0, srcY0, srcX1, srcY1,
>> +                              dstX0, dstY0, dstX1, dstY1,
>>                                false, mirror_y);
>>     }
>>
>> @@ -590,7 +604,7 @@ private:
>>     void encode_msaa(unsigned num_samples, intel_msaa_layout layout);
>>     void decode_msaa(unsigned num_samples, intel_msaa_layout layout);
>>     void kill_if_outside_dst_rect();
>> -   void translate_dst_to_src(unsigned intel_gen);
>> +   void translate_dst_to_src();
>>     void single_to_blend();
>>     void manual_blend(unsigned num_samples);
>>     void sample(struct brw_reg dst);
>> @@ -772,7 +786,7 @@ brw_blorp_blit_program::compile(struct brw_context
>> *brw,
>>        kill_if_outside_dst_rect();
>>
>>     /* Next, apply a translation to obtain coordinates in the source
>> image. */
>> -   translate_dst_to_src(brw->intel.gen);
>> +   translate_dst_to_src();
>>
>>     /* If the source image is not multisampled, then we want to fetch
>> sample
>>      * number 0, because that's the only sample there is.
>> @@ -844,7 +858,7 @@ brw_blorp_blit_program::alloc_push_const_regs(int
>> base_reg)
>>  #define CONST_LOC(name) offsetof(brw_blorp_wm_push_constants, name)
>>  #define ALLOC_REG(name) \
>>     this->name = \
>> -      brw_ud1_reg(BRW_GENERAL_REGISTER_FILE, base_reg, CONST_LOC(name) /
>> 4)
>> +      brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, base_reg, CONST_LOC(name) /
>> 4)
>>
>>     ALLOC_REG(dst_x0);
>>     ALLOC_REG(dst_x1);
>> @@ -1322,29 +1336,37 @@ brw_blorp_blit_program::kill_if_outside_dst_rect()
>>   * coordinates.
>>   */
>>  void
>> -brw_blorp_blit_program::translate_dst_to_src(unsigned intel_gen)
>> +brw_blorp_blit_program::translate_dst_to_src()
>>  {
>> +   struct brw_reg X_f = retype(X, BRW_REGISTER_TYPE_F);
>> +   struct brw_reg Y_f = retype(Y, BRW_REGISTER_TYPE_F);
>> +   struct brw_reg Xp_f = retype(Xp, BRW_REGISTER_TYPE_F);
>> +   struct brw_reg Yp_f = retype(Yp, BRW_REGISTER_TYPE_F);
>> +
>>     brw_set_compression_control(&func, BRW_COMPRESSION_COMPRESSED);
>> -   /* For mul instruction:
>> -    * On SNB when both src0 and src1 are of type D or UD, only the low 16
>> bits
>> -    * of each element of src0 are used.
>> -    * On IVB when both src0 and src1 are of type D or UD, only the low 16
>> bits
>> -    * of each element of src1 are used.
>> -    * multiplier can be positive or negative. So keep the multiplier in a
>> src
>> -    * register which don't get truncated during multiplication.
>> -    */
>> -   if (intel_gen == 6) {
>> -      brw_MUL(&func, Xp, X, x_transform.multiplier);
>> -      brw_MUL(&func, Yp, Y, y_transform.multiplier);
>> +   /* Move the UD coordinates to float registers. */
>> +   brw_MOV(&func, Xp_f, X);
>> +   brw_MOV(&func, Yp_f, Y);
>> +   /* Scale and offset */
>> +   brw_MUL(&func, X_f, Xp_f, x_transform.multiplier);
>> +   brw_MUL(&func, Y_f, Yp_f, y_transform.multiplier);
>> +   brw_ADD(&func, X_f, X_f, x_transform.offset);
>> +   brw_ADD(&func, Y_f, Y_f, y_transform.offset);
>> +   if (key->blit_scaled) {
>> +      /* Round the float coordinates down to nearest integer. */
>> +      brw_RNDD(&func, Xp_f, X_f);
>> +      brw_RNDD(&func, Yp_f, Y_f);
>> +      /* Move the float coordinates back to UD registers. */
>> +      brw_MOV(&func, X, Xp_f);
>> +      brw_MOV(&func, Y, Yp_f);
>>     }
>>     else {
>> -      brw_MUL(&func, Xp, x_transform.multiplier, X);
>> -      brw_MUL(&func, Yp, y_transform.multiplier, Y);
>> +      /* Move the float coordinates back to UD registers. */
>> +      brw_MOV(&func, Xp, X_f);
>> +      brw_MOV(&func, Yp, Y_f);
>> +      SWAP_XY_AND_XPYP();
>>     }
>
>
> I don't believe key->blit_scaled is needed.  Both branches of the if
> statement compute the same thing, since ((int) f) == ((int) RNDD(f)).  Let's
> just use the code in the "else" block, since that's fewer instructions.
>
right. consider it fixed.
>>
>> -   brw_ADD(&func, Xp, Xp, x_transform.offset);
>> -   brw_ADD(&func, Yp, Yp, y_transform.offset);
>>     brw_set_compression_control(&func, BRW_COMPRESSION_NONE);
>> -   SWAP_XY_AND_XPYP();
>>  }
>>
>>  /**
>> @@ -1709,29 +1731,30 @@ brw_blorp_blit_program::render_target_write()
>>
>>
>>  void
>> -brw_blorp_coord_transform_params::setup(GLuint src0, GLuint dst0, GLuint
>> dst1,
>> +brw_blorp_coord_transform_params::setup(GLfloat src0, GLfloat src1,
>> +                                        GLfloat dst0, GLfloat dst1,
>>                                          bool mirror)
>>  {
>> +   float scale = (src1 - src0) / (dst1 - dst0);
>>     if (!mirror) {
>>        /* When not mirroring a coordinate (say, X), we need:
>> -       *   x' - src_x0 = x - dst_x0
>> +       *   x' - src_x0 = x - dst_x0 + 0.5
>>         * Therefore:
>> -       *   x' = 1*x + (src_x0 - dst_x0)
>> +       *   x' = 1*x + (src_x0 - dst_x0 + 0.5)
>>         */
>> -      multiplier = 1;
>> -      offset = (int) (src0 - dst0);
>> +      multiplier = scale;
>> +      offset = (src0 - dst0 + 0.5) * scale;
>>     } else {
>>        /* When mirroring X we need:
>> -       *   x' - src_x0 = dst_x1 - x - 1
>> +       *   x' - src_x0 = dst_x1 - x - 0.5
>>         * Therefore:
>> -       *   x' = -1*x + (src_x0 + dst_x1 - 1)
>> +       *   x' = -1*x + (src_x0 + dst_x1 - 0.5)
>>         */
>> -      multiplier = -1;
>> -      offset = src0 + dst1 - 1;
>> +      multiplier = -scale;
>> +      offset = (src0 + dst1 - 0.5) * scale;
>>     }
>>  }
>
>
> It would be nice to have a comment in the above function saying why the
> 0.5's are necessary.  If I'm not mistaken, the reason they are necessary is
> that the blorp program uses "round toward zero" to convert the transformed
> floating point coordinates to integer coordinates, whereas the behaviour we
> actually want is "round to nearest", so 0.5 provides the necessary
> correction.
>
I'll put this comment in the code.
>>
>>
>> -
>>  /**
>>   * Determine which MSAA layout the GPU pipeline should be configured for,
>>   * based on the chip generation, the number of samples, and the true
>> layout of
>> @@ -1766,9 +1789,10 @@ brw_blorp_blit_params::brw_blorp_blit_params(struct
>> brw_context *brw,
>>                                               unsigned src_level, unsigned
>> src_layer,
>>                                               struct intel_mipmap_tree
>> *dst_mt,
>>                                               unsigned dst_level, unsigned
>> dst_layer,
>> -                                             GLuint src_x0, GLuint
>> src_y0,
>> -                                             GLuint dst_x0, GLuint
>> dst_y0,
>> -                                             GLuint dst_x1, GLuint
>> dst_y1,
>> +                                             GLfloat src_x0, GLfloat
>> src_y0,
>> +                                             GLfloat src_x1, GLfloat
>> src_y1,
>> +                                             GLfloat dst_x0, GLfloat
>> dst_y0,
>> +                                             GLfloat dst_x1, GLfloat
>> dst_y1,
>>                                               bool mirror_x, bool
>> mirror_y)
>>  {
>>     src.set(brw, src_mt, src_level, src_layer);
>> @@ -1834,6 +1858,11 @@ brw_blorp_blit_params::brw_blorp_blit_params(struct
>> brw_context *brw,
>>        wm_prog_key.persample_msaa_dispatch = true;
>>     }
>>
>> +   /* Scaled blitting or not. */
>> +   wm_prog_key.blit_scaled =
>> +      ((dst_x1 - dst_x0) == (src_x1 - src_x0) &&
>> +       (dst_y1 - dst_y0) == (src_y1 - src_y0)) ? false : true;
>> +
>
>
> This should also be able to go away.
>
> With that fixed, this patch is:
>
> Reviewed-by: Paul Berry <stereotype441 at gmail.com>
>
Thanks for reviewing Paul.