[Mesa-dev] [PATCH 2/3] intel: Add multisample scaled blitting in blorp engine

Anuj Phogat anuj.phogat at gmail.com
Wed May 1 14:10:20 PDT 2013


In traditional multisampled framebuffer rendering, color samples must be explicitly
resolved via BlitFramebuffer before doing the scaled blitting of the framebuffer.
So, scaled blitting of a multisample framebuffer takes two separate calls to
BlitFramebuffer.

This patch implements multisampled scaled resolve using just one
BlitFramebuffer call. The important changes in this patch are listed
below:
1. Add fractional scale/offset capability.
2. Modify blorp setup code to handle scaled blits properly.
   - Do register allocation for UD and float registers.
   - Use float registers to do scale / offset texture coordinates.
   - Change offset computation to consider float coordinates.
   - Round the scaled & offset coordinates down to integer.
   - Make relevant changes to ensure correct blitting of stencil / depth buffers.
3. Linear filtering is not yet implemented in blorp, so don't use the blorp
   engine for single-sampled scaled blits.
4. If INTEL_DEBUG=blorp is set, recompile the blorp shader to dump the native code
   for the blorp blit. This will not affect performance but assists in debugging.

Note: Observed no piglit regressions on sandybridge & ivybridge with these changes.

Signed-off-by: Anuj Phogat <anuj.phogat at gmail.com>
---
 src/mesa/drivers/dri/i965/brw_blorp.h          |  16 +-
 src/mesa/drivers/dri/i965/brw_blorp_blit.cpp   | 581 +++++++++++++++++--------
 src/mesa/drivers/dri/intel/intel_mipmap_tree.c |   2 +
 3 files changed, 411 insertions(+), 188 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_blorp.h b/src/mesa/drivers/dri/i965/brw_blorp.h
index 8915080..eb3a1e3 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp.h
+++ b/src/mesa/drivers/dri/i965/brw_blorp.h
@@ -41,6 +41,7 @@ brw_blorp_blit_miptrees(struct intel_context *intel,
                         struct intel_mipmap_tree *dst_mt,
                         unsigned dst_level, unsigned dst_layer,
                         int src_x0, int src_y0,
+                        int src_x1, int src_y1,
                         int dst_x0, int dst_y0,
                         int dst_x1, int dst_y1,
                         bool mirror_x, bool mirror_y);
@@ -158,11 +159,11 @@ public:
 
 struct brw_blorp_coord_transform_params
 {
-   void setup(GLuint src0, GLuint dst0, GLuint dst1,
+   void setup(GLuint src0, GLuint src1, GLuint dst0, GLuint dst1,
               bool mirror);
 
-   int16_t multiplier;
-   int16_t offset;
+   float multiplier;
+   float offset;
 };
 
 
@@ -176,7 +177,7 @@ struct brw_blorp_wm_push_constants
    brw_blorp_coord_transform_params y_transform;
 
    /* Pad out to an integral number of registers */
-   uint16_t pad[8];
+   uint16_t pad[4];
 };
 
 /* Every 32 bytes of push constant data constitutes one GEN register. */
@@ -307,6 +308,10 @@ struct brw_blorp_blit_prog_key
     * than one sample per pixel.
     */
    bool persample_msaa_dispatch;
+
+   /* True for scaled blitting.
+    */
+   bool blit_scaled;
 };
 
 class brw_blorp_blit_params : public brw_blorp_params
@@ -318,8 +323,9 @@ public:
                          struct intel_mipmap_tree *dst_mt,
                          unsigned dst_level, unsigned dst_layer,
                          GLuint src_x0, GLuint src_y0,
+                         GLuint src_x1, GLuint src_y1,
                          GLuint dst_x0, GLuint dst_y0,
-                         GLuint width, GLuint height,
+                         GLuint dst_x1, GLuint dst_y1,
                          bool mirror_x, bool mirror_y);
 
    virtual uint32_t get_wm_prog(struct brw_context *brw,
diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
index d4b1fda..fd3ce57 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
@@ -128,6 +128,7 @@ brw_blorp_blit_miptrees(struct intel_context *intel,
                         struct intel_mipmap_tree *dst_mt,
                         unsigned dst_level, unsigned dst_layer,
                         int src_x0, int src_y0,
+                        int src_x1, int src_y1,
                         int dst_x0, int dst_y0,
                         int dst_x1, int dst_y1,
                         bool mirror_x, bool mirror_y)
@@ -136,6 +137,7 @@ brw_blorp_blit_miptrees(struct intel_context *intel,
                                 src_mt, src_level, src_layer,
                                 dst_mt, dst_level, dst_layer,
                                 src_x0, src_y0,
+                                src_x1, src_y1,
                                 dst_x0, dst_y0,
                                 dst_x1, dst_y1,
                                 mirror_x, mirror_y);
@@ -147,6 +149,7 @@ do_blorp_blit(struct intel_context *intel, GLbitfield buffer_bit,
               struct intel_renderbuffer *src_irb,
               struct intel_renderbuffer *dst_irb,
               GLint srcX0, GLint srcY0,
+              GLint srcX1, GLint srcY1,
               GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
               bool mirror_x, bool mirror_y)
 {
@@ -164,7 +167,8 @@ do_blorp_blit(struct intel_context *intel, GLbitfield buffer_bit,
    brw_blorp_blit_miptrees(intel,
                            src_mt, src_irb->mt_level, src_irb->mt_layer,
                            dst_mt, dst_irb->mt_level, dst_irb->mt_layer,
-                           srcX0, srcY0, dstX0, dstY0, dstX1, dstY1,
+                           srcX0, srcY0, srcX1, srcY1,
+                           dstX0, dstY0, dstX1, dstY1,
                            mirror_x, mirror_y);
 
    intel_renderbuffer_set_needs_hiz_resolve(dst_irb);
@@ -221,9 +225,13 @@ try_blorp_blit(struct intel_context *intel,
    fixup_mirroring(mirror_y, srcY0, srcY1);
    fixup_mirroring(mirror_y, dstY0, dstY1);
 
-   /* Make sure width and height match */
-   if (srcX1 - srcX0 != dstX1 - dstX0) return false;
-   if (srcY1 - srcY0 != dstY1 - dstY0) return false;
+   /* Linear filtering is not yet implemented in blorp. So, do not use blorp
+    * engine for single sampled scaled blits.
+    */
+   if ((srcX1 - srcX0 != dstX1 - dstX0 ||
+        srcY1 - srcY0 != dstY1 - dstY0) &&
+       read_fb->Visual.samples == 0)
+      return false;
 
    /* If the destination rectangle needs to be clipped or scissored, do so.
     */
@@ -274,8 +282,10 @@ try_blorp_blit(struct intel_context *intel,
       for (unsigned i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; ++i) {
          dst_irb = intel_renderbuffer(ctx->DrawBuffer->_ColorDrawBuffers[i]);
 	 if (dst_irb)
-            do_blorp_blit(intel, buffer_bit, src_irb, dst_irb, srcX0, srcY0,
-                          dstX0, dstY0, dstX1, dstY1, mirror_x, mirror_y);
+            do_blorp_blit(intel, buffer_bit, src_irb, dst_irb,
+                          srcX0, srcY0, srcX1, srcY1,
+                          dstX0, dstY0, dstX1, dstY1,
+                          mirror_x, mirror_y);
       }
       break;
    case GL_DEPTH_BUFFER_BIT:
@@ -285,8 +295,10 @@ try_blorp_blit(struct intel_context *intel,
          intel_renderbuffer(draw_fb->Attachment[BUFFER_DEPTH].Renderbuffer);
       if (!formats_match(buffer_bit, src_irb, dst_irb))
          return false;
-      do_blorp_blit(intel, buffer_bit, src_irb, dst_irb, srcX0, srcY0,
-                    dstX0, dstY0, dstX1, dstY1, mirror_x, mirror_y);
+      do_blorp_blit(intel, buffer_bit, src_irb, dst_irb,
+                    srcX0, srcY0, srcX1, srcY1,
+                    dstX0, dstY0, dstX1, dstY1,
+                    mirror_x, mirror_y);
       break;
    case GL_STENCIL_BUFFER_BIT:
       src_irb =
@@ -295,8 +307,10 @@ try_blorp_blit(struct intel_context *intel,
          intel_renderbuffer(draw_fb->Attachment[BUFFER_STENCIL].Renderbuffer);
       if (!formats_match(buffer_bit, src_irb, dst_irb))
          return false;
-      do_blorp_blit(intel, buffer_bit, src_irb, dst_irb, srcX0, srcY0,
-                    dstX0, dstY0, dstX1, dstY1, mirror_x, mirror_y);
+      do_blorp_blit(intel, buffer_bit, src_irb, dst_irb,
+                    srcX0, srcY0, srcX1, srcY1,
+                    dstX0, dstY0, dstX1, dstY1,
+                    mirror_x, mirror_y);
       break;
    default:
       assert(false);
@@ -352,6 +366,7 @@ brw_blorp_copytexsubimage(struct intel_context *intel,
     */
 
    int srcY1 = srcY0 + height;
+   int srcX1 = srcX0 + width;
    int dstX1 = dstX0 + width;
    int dstY1 = dstY0 + height;
 
@@ -372,7 +387,9 @@ brw_blorp_copytexsubimage(struct intel_context *intel,
    }
 
    do_blorp_blit(intel, buffer_bit, src_irb, dst_irb,
-                 srcX0, srcY0, dstX0, dstY0, dstX1, dstY1, false, mirror_y);
+                 srcX0, srcY0, srcX1, srcY1,
+                 dstX0, dstY0, dstX1, dstY1,
+                 false, mirror_y);
 
    /* If we're copying a packed depth stencil texture, the above do_blorp_blit
     * copied depth (since buffer_bit != GL_STENCIL_BIT).  Now copy stencil as
@@ -384,7 +401,9 @@ brw_blorp_copytexsubimage(struct intel_context *intel,
        src_rb != NULL) {
       src_irb = intel_renderbuffer(src_rb);
       do_blorp_blit(intel, GL_STENCIL_BUFFER_BIT, src_irb, dst_irb,
-                    srcX0, srcY0, dstX0, dstY0, dstX1, dstY1, false, mirror_y);
+                    srcX0, srcY0, srcX1, srcY1,
+                    dstX0, dstY0, dstX1, dstY1,
+                    false, mirror_y);
    }
 
    dst_rb->Delete(ctx, dst_rb);
@@ -590,9 +609,22 @@ private:
    void alloc_regs();
    void alloc_push_const_regs(int base_reg);
    void compute_frag_coords();
-   void translate_tiling(bool old_tiled_w, bool new_tiled_w);
-   void encode_msaa(unsigned num_samples, intel_msaa_layout layout);
-   void decode_msaa(unsigned num_samples, intel_msaa_layout layout);
+   void translate_tiling(struct brw_reg rX, struct brw_reg rY,
+                         struct brw_reg rXp, struct brw_reg rYp,
+                         struct brw_reg rt1, struct brw_reg rt2,
+                         unsigned type, bool old_tiled_w, bool new_tiled_w);
+   void encode_msaa(struct brw_reg rX, struct brw_reg rY,
+                    struct brw_reg rXp, struct brw_reg rYp,
+                    struct brw_reg rt1, struct brw_reg rt2,
+                    struct brw_reg rS, unsigned type,
+                    unsigned num_samples,
+                    intel_msaa_layout layout);
+   void decode_msaa(struct brw_reg rX, struct brw_reg rY,
+                    struct brw_reg rXp, struct brw_reg rYp,
+                    struct brw_reg rt1, struct brw_reg rt2,
+                    struct brw_reg rS, unsigned type,
+                    unsigned num_samples,
+                    intel_msaa_layout layout);
    void kill_if_outside_dst_rect();
    void translate_dst_to_src();
    void single_to_blend();
@@ -641,27 +673,36 @@ private:
     */
    struct brw_reg mcs_data;
 
-   /* X coordinates.  We have two of them so that we can perform coordinate
-    * transformations easily.
+   /* X coordinates.  We have two of them for each type so that we can perform
+    * coordinate transformations easily.
     */
-   struct brw_reg x_coords[2];
+   struct brw_reg x_coords_uw[2];
+   struct brw_reg x_coords_ud[2];
+   struct brw_reg x_coords_f[2];
 
-   /* Y coordinates.  We have two of them so that we can perform coordinate
-    * transformations easily.
+   /* Y coordinates.  We have two of them for each type so that we can perform
+    * coordinate transformations easily.
     */
-   struct brw_reg y_coords[2];
+   struct brw_reg y_coords_uw[2];
+   struct brw_reg y_coords_ud[2];
+   struct brw_reg y_coords_f[2];
 
-   /* Which element of x_coords and y_coords is currently in use.
+   /* Which element of x_coords_ud and y_coords_ud is currently in use.
     */
    int xy_coord_index;
 
+   /* Which element of x_coords_uw and y_coords_uw is currently in use.
+    */
+   int xy_uw_coord_index;
+
    /* True if, at the point in the program currently being compiled, the
     * sample index is known to be zero.
     */
    bool s_is_zero;
 
-   /* Register storing the sample index when s_is_zero is false. */
-   struct brw_reg sample_index;
+   /* Registers storing the sample index when s_is_zero is false. */
+   struct brw_reg sample_index_uw;
+   struct brw_reg sample_index_ud[2];
 
    /* Temporaries */
    struct brw_reg t1;
@@ -686,6 +727,41 @@ brw_blorp_blit_program::~brw_blorp_blit_program()
    ralloc_free(mem_ctx);
 }
 
+/* In the code that follows, (X, Y) and (Xuw, Yuw) can be used to quickly
+ * refer to the active elements of (x_coords_ud, y_coords_ud) and
+ * (x_coords_uw, y_coords_uw) respectively. (Xp, Yp) and (Xuw_p, Yuw_p)
+ * ("X prime" and "Y prime") refer to the inactive elements.
+ *
+ * S, S1, S2 can be used to quickly refer to different types of sample_index
+ * registers.
+ */
+#define X x_coords_ud[xy_coord_index]
+#define Y y_coords_ud[xy_coord_index]
+#define Xp x_coords_ud[!xy_coord_index]
+#define Yp y_coords_ud[!xy_coord_index]
+
+#define Xuw x_coords_uw[xy_uw_coord_index]
+#define Yuw y_coords_uw[xy_uw_coord_index]
+#define Xuw_p x_coords_uw[!xy_uw_coord_index]
+#define Yuw_p y_coords_uw[!xy_uw_coord_index]
+
+#define Xf x_coords_f[0]
+#define Yf y_coords_f[0]
+#define Xf_p x_coords_f[1]
+#define Yf_p y_coords_f[1]
+
+#define S sample_index_uw
+#define S1 sample_index_ud[0]
+#define S2 sample_index_ud[1]
+
+/* Quickly swap the roles of (X, Y) and (Xp, Yp).  Saves us from having to do
+ * MOVs to transfer (Xp, Yp) to (X, Y) after a coordinate transformation.
+ */
+#define SWAP_XY_AND_XPYP() xy_coord_index = !xy_coord_index;
+
+/* Quickly swap the roles of (Xuw, Yuw) and (Xuw_p, Yuw_p). */
+#define SWAP_XY_AND_XPYP_UW() xy_uw_coord_index = !xy_uw_coord_index;
+
 const GLuint *
 brw_blorp_blit_program::compile(struct brw_context *brw,
                                 GLuint *program_size)
@@ -742,7 +818,7 @@ brw_blorp_blit_program::compile(struct brw_context *brw,
    /* Render target and texture hardware don't support W tiling. */
    const bool rt_tiled_w = false;
    const bool tex_tiled_w = false;
-
+   unsigned int type = BRW_REGISTER_TYPE_UW;
    /* The address that data will be written to is determined by the
     * coordinates supplied to the WM thread and the tiling and sample count of
     * the render target, according to the formula:
@@ -757,11 +833,14 @@ brw_blorp_blit_program::compile(struct brw_context *brw,
    if (rt_tiled_w != key->dst_tiled_w ||
        key->rt_samples != key->dst_samples ||
        key->rt_layout != key->dst_layout) {
-      encode_msaa(key->rt_samples, key->rt_layout);
+      encode_msaa(Xuw, Yuw, Xuw_p, Yuw_p, t1, t2, S, type,
+                  key->rt_samples, key->rt_layout);
       /* Now (X, Y, S) = detile(rt_tiling, offset) */
-      translate_tiling(rt_tiled_w, key->dst_tiled_w);
+      translate_tiling(Xuw, Yuw, Xuw_p, Yuw_p, t1, t2, type,
+                       rt_tiled_w, key->dst_tiled_w);
       /* Now (X, Y, S) = detile(dst_tiling, offset) */
-      decode_msaa(key->dst_samples, key->dst_layout);
+      decode_msaa(Xuw, Yuw, Xuw_p, Yuw_p, t1, t2, S, type,
+                  key->dst_samples, key->dst_layout);
    }
 
    /* Now (X, Y, S) = decode_msaa(dst_samples, detile(dst_tiling, offset)).
@@ -779,6 +858,7 @@ brw_blorp_blit_program::compile(struct brw_context *brw,
    /* Next, apply a translation to obtain coordinates in the source image. */
    translate_dst_to_src();
 
+   type = BRW_REGISTER_TYPE_UD;
    /* If the source image is not multisampled, then we want to fetch sample
     * number 0, because that's the only sample there is.
     */
@@ -791,7 +871,7 @@ brw_blorp_blit_program::compile(struct brw_context *brw,
     */
    if (key->blend) {
       if (brw->intel.gen == 6) {
-         /* Gen6 hardware an automatically blend using the SAMPLE message */
+         /* Gen6 hardware can automatically blend using the SAMPLE message */
          single_to_blend();
          sample(texture_data[0]);
       } else {
@@ -812,11 +892,20 @@ brw_blorp_blit_program::compile(struct brw_context *brw,
       if (tex_tiled_w != key->src_tiled_w ||
           key->tex_samples != key->src_samples ||
           key->tex_layout != key->src_layout) {
-         encode_msaa(key->src_samples, key->src_layout);
+         brw_set_compression_control(&func, BRW_COMPRESSION_COMPRESSED);
+         encode_msaa(vec8(X), vec8(Y), vec8(Xp), vec8(Yp),
+                     vec8(retype(t1, type)), vec8(retype(t2, type)),
+                     S1, type, key->src_samples, key->src_layout);
          /* Now (X, Y, S) = detile(src_tiling, offset) */
-         translate_tiling(key->src_tiled_w, tex_tiled_w);
+         translate_tiling(vec8(X), vec8(Y), vec8(Xp), vec8(Yp),
+                          vec8(retype(t1, type)), vec8(retype(t2, type)),
+                          BRW_REGISTER_TYPE_UD,
+                          key->src_tiled_w, tex_tiled_w);
          /* Now (X, Y, S) = detile(tex_tiling, offset) */
-         decode_msaa(key->tex_samples, key->tex_layout);
+         decode_msaa(vec8(X), vec8(Y), vec8(Xp), vec8(Yp),
+                     vec8(retype(t1, type)), vec8(retype(t2, type)),
+                     S1, type, key->tex_samples, key->tex_layout);
+         brw_set_compression_control(&func, BRW_COMPRESSION_NONE);
       }
 
       /* Now (X, Y, S) = decode_msaa(tex_samples, detile(tex_tiling, offset)).
@@ -851,14 +940,18 @@ brw_blorp_blit_program::alloc_push_const_regs(int base_reg)
    this->name = \
       brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, base_reg, CONST_LOC(name) / 2)
 
+#define ALLOC_REG_F(name) \
+   this->name = \
+      brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, base_reg, CONST_LOC(name) / 4)
+
    ALLOC_REG(dst_x0);
    ALLOC_REG(dst_x1);
    ALLOC_REG(dst_y0);
    ALLOC_REG(dst_y1);
-   ALLOC_REG(x_transform.multiplier);
-   ALLOC_REG(x_transform.offset);
-   ALLOC_REG(y_transform.multiplier);
-   ALLOC_REG(y_transform.offset);
+   ALLOC_REG_F(x_transform.multiplier);
+   ALLOC_REG_F(x_transform.offset);
+   ALLOC_REG_F(y_transform.multiplier);
+   ALLOC_REG_F(y_transform.offset);
 #undef CONST_LOC
 #undef ALLOC_REG
 }
@@ -880,16 +973,41 @@ brw_blorp_blit_program::alloc_regs()
    this->mcs_data =
       retype(brw_vec8_grf(reg, 0), BRW_REGISTER_TYPE_UD); reg += 8;
    for (int i = 0; i < 2; ++i) {
-      this->x_coords[i]
+      this->x_coords_uw[i]
          = vec16(retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW));
-      this->y_coords[i]
+      this->y_coords_uw[i]
          = vec16(retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW));
    }
+   for (int i = 0; i < 2; ++i) {
+      this->x_coords_f[i]
+         = vec8(brw_vec8_grf(reg, 0));
+      reg += 2;
+      this->y_coords_f[i]
+         = vec8(brw_vec8_grf(reg, 0));
+      reg += 2;
+   }
+   for (int i = 0; i < 2; ++i) {
+      this->x_coords_ud[i]
+         = vec16(retype(brw_vec8_grf(reg, 0), BRW_REGISTER_TYPE_UD));
+      reg += 2;
+      this->y_coords_ud[i]
+         = vec16(retype(brw_vec8_grf(reg, 0), BRW_REGISTER_TYPE_UD));
+      reg += 2;
+   }
    this->xy_coord_index = 0;
-   this->sample_index
+   this->xy_uw_coord_index = 0;
+
+   this->sample_index_uw
       = vec16(retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW));
-   this->t1 = vec16(retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW));
-   this->t2 = vec16(retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW));
+   this->sample_index_ud[0]
+      = vec16(retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UD));
+   this->sample_index_ud[1]
+      = vec16(retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UD));
+
+   this->t1 = vec16(retype(brw_vec8_grf(reg, 0), BRW_REGISTER_TYPE_UW));
+   reg += 2;
+   this->t2 = vec16(retype(brw_vec8_grf(reg, 0), BRW_REGISTER_TYPE_UW));
+   reg += 2;
 
    /* Make sure we didn't run out of registers */
    assert(reg <= GEN7_MRF_HACK_START);
@@ -898,23 +1016,6 @@ brw_blorp_blit_program::alloc_regs()
    this->base_mrf = mrf;
 }
 
-/* In the code that follows, X and Y can be used to quickly refer to the
- * active elements of x_coords and y_coords, and Xp and Yp ("X prime" and "Y
- * prime") to the inactive elements.
- *
- * S can be used to quickly refer to sample_index.
- */
-#define X x_coords[xy_coord_index]
-#define Y y_coords[xy_coord_index]
-#define Xp x_coords[!xy_coord_index]
-#define Yp y_coords[!xy_coord_index]
-#define S sample_index
-
-/* Quickly swap the roles of (X, Y) and (Xp, Yp).  Saves us from having to do
- * MOVs to transfor (Xp, Yp) to (X, Y) after a coordinate transformation.
- */
-#define SWAP_XY_AND_XPYP() xy_coord_index = !xy_coord_index;
-
 /**
  * Emit code to compute the X and Y coordinates of the pixels being rendered
  * by this WM invocation.
@@ -946,7 +1047,7 @@ brw_blorp_blit_program::compute_frag_coords()
     * Then, we need to add the repeating sequence (0, 1, 0, 1, ...) to the
     * result, since pixels n+1 and n+3 are in the right half of the subspan.
     */
-   brw_ADD(&func, X, stride(suboffset(R1, 4), 2, 4, 0), brw_imm_v(0x10101010));
+   brw_ADD(&func, Xuw, stride(suboffset(R1, 4), 2, 4, 0), brw_imm_v(0x10101010));
 
    /* Similarly, Y coordinates for subspans come from R1.2[31:16] through
     * R1.5[31:16], so to get pixel Y coordinates we need to start at the 5th
@@ -956,7 +1057,7 @@ brw_blorp_blit_program::compute_frag_coords()
     * And we need to add the repeating sequence (0, 0, 1, 1, ...), since
     * pixels n+2 and n+3 are in the bottom half of the subspan.
     */
-   brw_ADD(&func, Y, stride(suboffset(R1, 5), 2, 4, 0), brw_imm_v(0x11001100));
+   brw_ADD(&func, Yuw, stride(suboffset(R1, 5), 2, 4, 0), brw_imm_v(0x11001100));
 
    if (key->persample_msaa_dispatch) {
       switch (key->rt_samples) {
@@ -1001,6 +1102,14 @@ brw_blorp_blit_program::compute_frag_coords()
                 "brw_blorp_blit_program::compute_frag_coords()");
          break;
       }
+
+      if (key->dst_tiled_w && key->dst_samples > 0) {
+         /* These instructions are required only for multisample to multisample
+          * stencil blits.
+          */
+         brw_MOV(&func, vec8(S1), vec8(S));
+         brw_MOV(&func, vec8(S2), suboffset(vec8(S), 8));
+      }
       s_is_zero = false;
    } else {
       /* Either the destination surface is single-sampled, or the WM will be
@@ -1025,7 +1134,11 @@ brw_blorp_blit_program::compute_frag_coords()
  * are booleans where true represents W tiling and false represents Y tiling.
  */
 void
-brw_blorp_blit_program::translate_tiling(bool old_tiled_w, bool new_tiled_w)
+brw_blorp_blit_program::translate_tiling(struct brw_reg rX, struct brw_reg rY,
+                                         struct brw_reg rXp, struct brw_reg rYp,
+                                         struct brw_reg rt1, struct brw_reg rt2,
+                                         unsigned type, bool old_tiled_w,
+                                         bool new_tiled_w)
 {
    if (old_tiled_w == new_tiled_w)
       return;
@@ -1063,22 +1176,21 @@ brw_blorp_blit_program::translate_tiling(bool old_tiled_w, bool new_tiled_w)
        *   X' = (X & ~0b1011) >> 1 | (Y & 0b1) << 2 | X & 0b1         (4)
        *   Y' = (Y & ~0b1) << 1 | (X & 0b1000) >> 2 | (X & 0b10) >> 1
        */
-      brw_AND(&func, t1, X, brw_imm_uw(0xfff4)); /* X & ~0b1011 */
-      brw_SHR(&func, t1, t1, brw_imm_uw(1)); /* (X & ~0b1011) >> 1 */
-      brw_AND(&func, t2, Y, brw_imm_uw(1)); /* Y & 0b1 */
-      brw_SHL(&func, t2, t2, brw_imm_uw(2)); /* (Y & 0b1) << 2 */
-      brw_OR(&func, t1, t1, t2); /* (X & ~0b1011) >> 1 | (Y & 0b1) << 2 */
-      brw_AND(&func, t2, X, brw_imm_uw(1)); /* X & 0b1 */
-      brw_OR(&func, Xp, t1, t2);
-      brw_AND(&func, t1, Y, brw_imm_uw(0xfffe)); /* Y & ~0b1 */
-      brw_SHL(&func, t1, t1, brw_imm_uw(1)); /* (Y & ~0b1) << 1 */
-      brw_AND(&func, t2, X, brw_imm_uw(8)); /* X & 0b1000 */
-      brw_SHR(&func, t2, t2, brw_imm_uw(2)); /* (X & 0b1000) >> 2 */
-      brw_OR(&func, t1, t1, t2); /* (Y & ~0b1) << 1 | (X & 0b1000) >> 2 */
-      brw_AND(&func, t2, X, brw_imm_uw(2)); /* X & 0b10 */
-      brw_SHR(&func, t2, t2, brw_imm_uw(1)); /* (X & 0b10) >> 1 */
-      brw_OR(&func, Yp, t1, t2);
-      SWAP_XY_AND_XPYP();
+      brw_AND(&func, rt1, rX, brw_imm_uw(0xfff4)); /* X & ~0b1011 */
+      brw_SHR(&func, rt1, rt1, brw_imm_uw(1)); /* (X & ~0b1011) >> 1 */
+      brw_AND(&func, rt2, rY, brw_imm_uw(1)); /* Y & 0b1 */
+      brw_SHL(&func, rt2, rt2, brw_imm_uw(2)); /* (Y & 0b1) << 2 */
+      brw_OR(&func, rt1, rt1, rt2); /* (X & ~0b1011) >> 1 | (Y & 0b1) << 2 */
+      brw_AND(&func, rt2, rX, brw_imm_uw(1)); /* X & 0b1 */
+      brw_OR(&func, rXp, rt1, rt2);
+      brw_AND(&func, rt1, rY, brw_imm_uw(0xfffe)); /* Y & ~0b1 */
+      brw_SHL(&func, rt1, rt1, brw_imm_uw(1)); /* (Y & ~0b1) << 1 */
+      brw_AND(&func, rt2, rX, brw_imm_uw(8)); /* X & 0b1000 */
+      brw_SHR(&func, rt2, rt2, brw_imm_uw(2)); /* (X & 0b1000) >> 2 */
+      brw_OR(&func, rt1, rt1, rt2); /* (Y & ~0b1) << 1 | (X & 0b1000) >> 2 */
+      brw_AND(&func, rt2, rX, brw_imm_uw(2)); /* X & 0b10 */
+      brw_SHR(&func, rt2, rt2, brw_imm_uw(1)); /* (X & 0b10) >> 1 */
+      brw_OR(&func, rYp, rt1, rt2);
    } else {
       /* Applying the same logic as above, but in reverse, we obtain the
        * formulas:
@@ -1086,22 +1198,27 @@ brw_blorp_blit_program::translate_tiling(bool old_tiled_w, bool new_tiled_w)
        * X' = (X & ~0b101) << 1 | (Y & 0b10) << 2 | (Y & 0b1) << 1 | X & 0b1
        * Y' = (Y & ~0b11) >> 1 | (X & 0b100) >> 2
        */
-      brw_AND(&func, t1, X, brw_imm_uw(0xfffa)); /* X & ~0b101 */
-      brw_SHL(&func, t1, t1, brw_imm_uw(1)); /* (X & ~0b101) << 1 */
-      brw_AND(&func, t2, Y, brw_imm_uw(2)); /* Y & 0b10 */
-      brw_SHL(&func, t2, t2, brw_imm_uw(2)); /* (Y & 0b10) << 2 */
-      brw_OR(&func, t1, t1, t2); /* (X & ~0b101) << 1 | (Y & 0b10) << 2 */
-      brw_AND(&func, t2, Y, brw_imm_uw(1)); /* Y & 0b1 */
-      brw_SHL(&func, t2, t2, brw_imm_uw(1)); /* (Y & 0b1) << 1 */
-      brw_OR(&func, t1, t1, t2); /* (X & ~0b101) << 1 | (Y & 0b10) << 2
+      brw_AND(&func, rt1, rX, brw_imm_uw(0xfffa)); /* X & ~0b101 */
+      brw_SHL(&func, rt1, rt1, brw_imm_uw(1)); /* (X & ~0b101) << 1 */
+      brw_AND(&func, rt2, rY, brw_imm_uw(2)); /* Y & 0b10 */
+      brw_SHL(&func, rt2, rt2, brw_imm_uw(2)); /* (Y & 0b10) << 2 */
+      brw_OR(&func, rt1, rt1, rt2); /* (X & ~0b101) << 1 | (Y & 0b10) << 2 */
+      brw_AND(&func, rt2, rY, brw_imm_uw(1)); /* Y & 0b1 */
+      brw_SHL(&func, rt2, rt2, brw_imm_uw(1)); /* (Y & 0b1) << 1 */
+      brw_OR(&func, rt1, rt1, rt2); /* (X & ~0b101) << 1 | (Y & 0b10) << 2
                                     | (Y & 0b1) << 1 */
-      brw_AND(&func, t2, X, brw_imm_uw(1)); /* X & 0b1 */
-      brw_OR(&func, Xp, t1, t2);
-      brw_AND(&func, t1, Y, brw_imm_uw(0xfffc)); /* Y & ~0b11 */
-      brw_SHR(&func, t1, t1, brw_imm_uw(1)); /* (Y & ~0b11) >> 1 */
-      brw_AND(&func, t2, X, brw_imm_uw(4)); /* X & 0b100 */
-      brw_SHR(&func, t2, t2, brw_imm_uw(2)); /* (X & 0b100) >> 2 */
-      brw_OR(&func, Yp, t1, t2);
+      brw_AND(&func, rt2, rX, brw_imm_uw(1)); /* X & 0b1 */
+      brw_OR(&func, rXp, rt1, rt2);
+      brw_AND(&func, rt1, rY, brw_imm_uw(0xfffc)); /* Y & ~0b11 */
+      brw_SHR(&func, rt1, rt1, brw_imm_uw(1)); /* (Y & ~0b11) >> 1 */
+      brw_AND(&func, rt2, rX, brw_imm_uw(4)); /* X & 0b100 */
+      brw_SHR(&func, rt2, rt2, brw_imm_uw(2)); /* (X & 0b100) >> 2 */
+      brw_OR(&func, rYp, rt1, rt2);
+   }
+
+   if (type == BRW_REGISTER_TYPE_UW) {
+      SWAP_XY_AND_XPYP_UW();
+   } else {
       SWAP_XY_AND_XPYP();
    }
 }
@@ -1117,7 +1234,11 @@ brw_blorp_blit_program::translate_tiling(bool old_tiled_w, bool new_tiled_w)
  * (See brw_blorp_blit_program).
  */
 void
-brw_blorp_blit_program::encode_msaa(unsigned num_samples,
+brw_blorp_blit_program::encode_msaa(struct brw_reg rX, struct brw_reg rY,
+                                    struct brw_reg rXp, struct brw_reg rYp,
+                                    struct brw_reg rt1, struct brw_reg rt2,
+                                    struct brw_reg rS, unsigned type,
+                                    unsigned num_samples,
                                     intel_msaa_layout layout)
 {
    switch (layout) {
@@ -1137,57 +1258,76 @@ brw_blorp_blit_program::encode_msaa(unsigned num_samples,
    case INTEL_MSAA_LAYOUT_IMS:
       switch (num_samples) {
       case 4:
+         if (key->dst_samples > 0 && type == BRW_REGISTER_TYPE_UD) {
+            /* Move sample index to UD registers. Required only for multisample
+               to multisample stencil blits.
+             */
+            brw_MOV(&func, vec8(S1), S);
+            brw_MOV(&func, vec8(S2), suboffset(S, 8));
+         }
          /* encode_msaa(4, IMS, X, Y, S) = (X', Y', 0)
           *   where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
           *         Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
           */
-         brw_AND(&func, t1, X, brw_imm_uw(0xfffe)); /* X & ~0b1 */
+         brw_AND(&func, rt1, rX, brw_imm_uw(0xfffe)); /* X & ~0b1 */
          if (!s_is_zero) {
-            brw_AND(&func, t2, S, brw_imm_uw(1)); /* S & 0b1 */
-            brw_OR(&func, t1, t1, t2); /* (X & ~0b1) | (S & 0b1) */
+            brw_AND(&func, rt2, rS, brw_imm_uw(1)); /* S & 0b1 */
+            brw_OR(&func, rt1, rt1, rt2); /* (X & ~0b1) | (S & 0b1) */
          }
-         brw_SHL(&func, t1, t1, brw_imm_uw(1)); /* (X & ~0b1) << 1
+         brw_SHL(&func, rt1, rt1, brw_imm_uw(1)); /* (X & ~0b1) << 1
                                                    | (S & 0b1) << 1 */
-         brw_AND(&func, t2, X, brw_imm_uw(1)); /* X & 0b1 */
-         brw_OR(&func, Xp, t1, t2);
-         brw_AND(&func, t1, Y, brw_imm_uw(0xfffe)); /* Y & ~0b1 */
-         brw_SHL(&func, t1, t1, brw_imm_uw(1)); /* (Y & ~0b1) << 1 */
+         brw_AND(&func, rt2, rX, brw_imm_uw(1)); /* X & 0b1 */
+         brw_OR(&func, rXp, rt1, rt2);
+         brw_AND(&func, rt1, rY, brw_imm_uw(0xfffe)); /* Y & ~0b1 */
+         brw_SHL(&func, rt1, rt1, brw_imm_uw(1)); /* (Y & ~0b1) << 1 */
          if (!s_is_zero) {
-            brw_AND(&func, t2, S, brw_imm_uw(2)); /* S & 0b10 */
-            brw_OR(&func, t1, t1, t2); /* (Y & ~0b1) << 1 | (S & 0b10) */
+            brw_AND(&func, rt2, rS, brw_imm_uw(2)); /* S & 0b10 */
+            brw_OR(&func, rt1, rt1, rt2); /* (Y & ~0b1) << 1 | (S & 0b10) */
          }
-         brw_AND(&func, t2, Y, brw_imm_uw(1)); /* Y & 0b1 */
-         brw_OR(&func, Yp, t1, t2);
+         brw_AND(&func, rt2, rY, brw_imm_uw(1)); /* Y & 0b1 */
+         brw_OR(&func, rYp, rt1, rt2);
          break;
       case 8:
+         if (key->dst_samples > 0 && type == BRW_REGISTER_TYPE_UD) {
+            /* Move sample index to UD registers. Required only for multisample
+               to multisample stencil blits.
+             */
+            brw_MOV(&func, vec8(S1), S);
+            brw_MOV(&func, vec8(S2), suboffset(S, 8));
+         }
          /* encode_msaa(8, IMS, X, Y, S) = (X', Y', 0)
           *   where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1
           *              | (X & 0b1)
           *         Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
           */
-         brw_AND(&func, t1, X, brw_imm_uw(0xfffe)); /* X & ~0b1 */
-         brw_SHL(&func, t1, t1, brw_imm_uw(2)); /* (X & ~0b1) << 2 */
+         brw_AND(&func, rt1, rX, brw_imm_uw(0xfffe)); /* X & ~0b1 */
+         brw_SHL(&func, rt1, rt1, brw_imm_uw(2)); /* (X & ~0b1) << 2 */
          if (!s_is_zero) {
-            brw_AND(&func, t2, S, brw_imm_uw(4)); /* S & 0b100 */
-            brw_OR(&func, t1, t1, t2); /* (X & ~0b1) << 2 | (S & 0b100) */
-            brw_AND(&func, t2, S, brw_imm_uw(1)); /* S & 0b1 */
-            brw_SHL(&func, t2, t2, brw_imm_uw(1)); /* (S & 0b1) << 1 */
-            brw_OR(&func, t1, t1, t2); /* (X & ~0b1) << 2 | (S & 0b100)
+            brw_AND(&func, rt2, rS, brw_imm_uw(4)); /* S & 0b100 */
+            brw_OR(&func, rt1, rt1, rt2); /* (X & ~0b1) << 2 | (S & 0b100) */
+            brw_AND(&func, rt2, rS, brw_imm_uw(1)); /* S & 0b1 */
+            brw_SHL(&func, rt2, rt2, brw_imm_uw(1)); /* (S & 0b1) << 1 */
+            brw_OR(&func, rt1, rt1, rt2); /* (X & ~0b1) << 2 | (S & 0b100)
                                           | (S & 0b1) << 1 */
          }
-         brw_AND(&func, t2, X, brw_imm_uw(1)); /* X & 0b1 */
-         brw_OR(&func, Xp, t1, t2);
-         brw_AND(&func, t1, Y, brw_imm_uw(0xfffe)); /* Y & ~0b1 */
-         brw_SHL(&func, t1, t1, brw_imm_uw(1)); /* (Y & ~0b1) << 1 */
+         brw_AND(&func, rt2, rX, brw_imm_uw(1)); /* X & 0b1 */
+         brw_OR(&func, rXp, rt1, rt2);
+         brw_AND(&func, rt1, rY, brw_imm_uw(0xfffe)); /* Y & ~0b1 */
+         brw_SHL(&func, rt1, rt1, brw_imm_uw(1)); /* (Y & ~0b1) << 1 */
          if (!s_is_zero) {
-            brw_AND(&func, t2, S, brw_imm_uw(2)); /* S & 0b10 */
-            brw_OR(&func, t1, t1, t2); /* (Y & ~0b1) << 1 | (S & 0b10) */
+            brw_AND(&func, rt2, rS, brw_imm_uw(2)); /* S & 0b10 */
+            brw_OR(&func, rt1, rt1, rt2); /* (Y & ~0b1) << 1 | (S & 0b10) */
          }
-         brw_AND(&func, t2, Y, brw_imm_uw(1)); /* Y & 0b1 */
-         brw_OR(&func, Yp, t1, t2);
+         brw_AND(&func, rt2, rY, brw_imm_uw(1)); /* Y & 0b1 */
+         brw_OR(&func, rYp, rt1, rt2);
          break;
       }
-      SWAP_XY_AND_XPYP();
+
+      if (type == BRW_REGISTER_TYPE_UW) {
+         SWAP_XY_AND_XPYP_UW();
+      } else {
+         SWAP_XY_AND_XPYP();
+      }
       s_is_zero = true;
       break;
    }
@@ -1204,7 +1344,11 @@ brw_blorp_blit_program::encode_msaa(unsigned num_samples,
  * (See brw_blorp_blit_program).
  */
 void
-brw_blorp_blit_program::decode_msaa(unsigned num_samples,
+brw_blorp_blit_program::decode_msaa(struct brw_reg rX, struct brw_reg rY,
+                                    struct brw_reg rXp, struct brw_reg rYp,
+                                    struct brw_reg rt1, struct brw_reg rt2,
+                                    struct brw_reg rS, unsigned type,
+                                    unsigned num_samples,
                                     intel_msaa_layout layout)
 {
    switch (layout) {
@@ -1230,18 +1374,18 @@ brw_blorp_blit_program::decode_msaa(unsigned num_samples,
           *         Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
           *         S = (Y & 0b10) | (X & 0b10) >> 1
           */
-         brw_AND(&func, t1, X, brw_imm_uw(0xfffc)); /* X & ~0b11 */
-         brw_SHR(&func, t1, t1, brw_imm_uw(1)); /* (X & ~0b11) >> 1 */
-         brw_AND(&func, t2, X, brw_imm_uw(1)); /* X & 0b1 */
-         brw_OR(&func, Xp, t1, t2);
-         brw_AND(&func, t1, Y, brw_imm_uw(0xfffc)); /* Y & ~0b11 */
-         brw_SHR(&func, t1, t1, brw_imm_uw(1)); /* (Y & ~0b11) >> 1 */
-         brw_AND(&func, t2, Y, brw_imm_uw(1)); /* Y & 0b1 */
-         brw_OR(&func, Yp, t1, t2);
-         brw_AND(&func, t1, Y, brw_imm_uw(2)); /* Y & 0b10 */
-         brw_AND(&func, t2, X, brw_imm_uw(2)); /* X & 0b10 */
-         brw_SHR(&func, t2, t2, brw_imm_uw(1)); /* (X & 0b10) >> 1 */
-         brw_OR(&func, S, t1, t2);
+         brw_AND(&func, rt1, rX, brw_imm_uw(0xfffc)); /* X & ~0b11 */
+         brw_SHR(&func, rt1, rt1, brw_imm_uw(1)); /* (X & ~0b11) >> 1 */
+         brw_AND(&func, rt2, rX, brw_imm_uw(1)); /* X & 0b1 */
+         brw_OR(&func, rXp, rt1, rt2);
+         brw_AND(&func, rt1, rY, brw_imm_uw(0xfffc)); /* Y & ~0b11 */
+         brw_SHR(&func, rt1, rt1, brw_imm_uw(1)); /* (Y & ~0b11) >> 1 */
+         brw_AND(&func, rt2, rY, brw_imm_uw(1)); /* Y & 0b1 */
+         brw_OR(&func, rYp, rt1, rt2);
+         brw_AND(&func, rt1, rY, brw_imm_uw(2)); /* Y & 0b10 */
+         brw_AND(&func, rt2, rX, brw_imm_uw(2)); /* X & 0b10 */
+         brw_SHR(&func, rt2, rt2, brw_imm_uw(1)); /* (X & 0b10) >> 1 */
+         brw_OR(&func, rS, rt1, rt2);
          break;
       case 8:
          /* decode_msaa(8, IMS, X, Y, 0) = (X', Y', S)
@@ -1249,24 +1393,29 @@ brw_blorp_blit_program::decode_msaa(unsigned num_samples,
           *         Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
           *         S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1
           */
-         brw_AND(&func, t1, X, brw_imm_uw(0xfff8)); /* X & ~0b111 */
-         brw_SHR(&func, t1, t1, brw_imm_uw(2)); /* (X & ~0b111) >> 2 */
-         brw_AND(&func, t2, X, brw_imm_uw(1)); /* X & 0b1 */
-         brw_OR(&func, Xp, t1, t2);
-         brw_AND(&func, t1, Y, brw_imm_uw(0xfffc)); /* Y & ~0b11 */
-         brw_SHR(&func, t1, t1, brw_imm_uw(1)); /* (Y & ~0b11) >> 1 */
-         brw_AND(&func, t2, Y, brw_imm_uw(1)); /* Y & 0b1 */
-         brw_OR(&func, Yp, t1, t2);
-         brw_AND(&func, t1, X, brw_imm_uw(4)); /* X & 0b100 */
-         brw_AND(&func, t2, Y, brw_imm_uw(2)); /* Y & 0b10 */
-         brw_OR(&func, t1, t1, t2); /* (X & 0b100) | (Y & 0b10) */
-         brw_AND(&func, t2, X, brw_imm_uw(2)); /* X & 0b10 */
-         brw_SHR(&func, t2, t2, brw_imm_uw(1)); /* (X & 0b10) >> 1 */
-         brw_OR(&func, S, t1, t2);
+         brw_AND(&func, rt1, rX, brw_imm_uw(0xfff8)); /* X & ~0b111 */
+         brw_SHR(&func, rt1, rt1, brw_imm_uw(2)); /* (X & ~0b111) >> 2 */
+         brw_AND(&func, rt2, rX, brw_imm_uw(1)); /* X & 0b1 */
+         brw_OR(&func, rXp, rt1, rt2);
+         brw_AND(&func, rt1, rY, brw_imm_uw(0xfffc)); /* Y & ~0b11 */
+         brw_SHR(&func, rt1, rt1, brw_imm_uw(1)); /* (Y & ~0b11) >> 1 */
+         brw_AND(&func, rt2, rY, brw_imm_uw(1)); /* Y & 0b1 */
+         brw_OR(&func, rYp, rt1, rt2);
+         brw_AND(&func, rt1, rX, brw_imm_uw(4)); /* X & 0b100 */
+         brw_AND(&func, rt2, rY, brw_imm_uw(2)); /* Y & 0b10 */
+         brw_OR(&func, rt1, rt1, rt2); /* (X & 0b100) | (Y & 0b10) */
+         brw_AND(&func, rt2, rX, brw_imm_uw(2)); /* X & 0b10 */
+         brw_SHR(&func, rt2, rt2, brw_imm_uw(1)); /* (X & 0b10) >> 1 */
+         brw_OR(&func, rS, rt1, rt2);
          break;
       }
       s_is_zero = false;
-      SWAP_XY_AND_XPYP();
+
+      if (type == BRW_REGISTER_TYPE_UW) {
+         SWAP_XY_AND_XPYP_UW();
+      } else {
+         SWAP_XY_AND_XPYP();
+      }
       break;
    }
 }
@@ -1283,10 +1432,10 @@ brw_blorp_blit_program::kill_if_outside_dst_rect()
    struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
    struct brw_reg null16 = vec16(retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
 
-   brw_CMP(&func, null16, BRW_CONDITIONAL_GE, X, dst_x0);
-   brw_CMP(&func, null16, BRW_CONDITIONAL_GE, Y, dst_y0);
-   brw_CMP(&func, null16, BRW_CONDITIONAL_L, X, dst_x1);
-   brw_CMP(&func, null16, BRW_CONDITIONAL_L, Y, dst_y1);
+   brw_CMP(&func, null16, BRW_CONDITIONAL_GE, Xuw, dst_x0);
+   brw_CMP(&func, null16, BRW_CONDITIONAL_GE, Yuw, dst_y0);
+   brw_CMP(&func, null16, BRW_CONDITIONAL_L, Xuw, dst_x1);
+   brw_CMP(&func, null16, BRW_CONDITIONAL_L, Yuw, dst_y1);
 
    brw_set_predicate_control(&func, BRW_PREDICATE_NONE);
    brw_push_insn_state(&func);
@@ -1302,11 +1451,29 @@ brw_blorp_blit_program::kill_if_outside_dst_rect()
 void
 brw_blorp_blit_program::translate_dst_to_src()
 {
-   brw_MUL(&func, Xp, X, x_transform.multiplier);
-   brw_MUL(&func, Yp, Y, y_transform.multiplier);
-   brw_ADD(&func, Xp, Xp, x_transform.offset);
-   brw_ADD(&func, Yp, Yp, y_transform.offset);
-   SWAP_XY_AND_XPYP();
+   /* To allow scaled blitting convert the uw coordinates to float. */
+   expand_to_32_bits(Xuw, Xf);
+   expand_to_32_bits(Yuw, Yf);
+
+   brw_set_compression_control(&func, BRW_COMPRESSION_COMPRESSED);
+   brw_MUL(&func, Xf_p, Xf, x_transform.multiplier);
+   brw_MUL(&func, Yf_p, Yf, y_transform.multiplier);
+   brw_ADD(&func, Xf_p, Xf_p, x_transform.offset);
+   brw_ADD(&func, Yf_p, Yf_p, y_transform.offset);
+   if (key->blend && key->blit_scaled) {
+      /* Round the coordinates in case of scaled blits */
+      brw_RNDD(&func, Xf, Xf_p);
+      brw_RNDD(&func, Yf, Yf_p);
+      /* Move the float coordinates to UD coordinates */
+      brw_MOV(&func, X, Xf);
+      brw_MOV(&func, Y, Yf);
+   }
+   else if(key->blend || (key->src_tiled_w && key->dst_tiled_w)) {
+      /* Move the float coordinates to UD coordinates */
+      brw_MOV(&func, X, Xf_p);
+      brw_MOV(&func, Y, Yf_p);
+   }
+   brw_set_compression_control(&func, BRW_COMPRESSION_NONE);
 }
 
 /**
@@ -1322,11 +1489,10 @@ brw_blorp_blit_program::single_to_blend()
     * that maxe up a pixel).  So we need to multiply our X and Y coordinates
     * each by 2 and then add 1.
     */
-   brw_SHL(&func, t1, X, brw_imm_w(1));
-   brw_SHL(&func, t2, Y, brw_imm_w(1));
-   brw_ADD(&func, Xp, t1, brw_imm_w(1));
-   brw_ADD(&func, Yp, t2, brw_imm_w(1));
-   SWAP_XY_AND_XPYP();
+   brw_SHL(&func, retype(t1, BRW_REGISTER_TYPE_UD), vec8(X), brw_imm_w(1));
+   brw_SHL(&func, retype(t2, BRW_REGISTER_TYPE_UD), vec8(Y), brw_imm_w(1));
+   brw_ADD(&func, X, vec8(retype(t1, BRW_REGISTER_TYPE_UD)), brw_imm_w(1));
+   brw_ADD(&func, Y, vec8(retype(t2, BRW_REGISTER_TYPE_UD)), brw_imm_w(1));
 }
 
 
@@ -1398,6 +1564,8 @@ brw_blorp_blit_program::manual_blend(unsigned num_samples)
          s_is_zero = true;
       } else {
          s_is_zero = false;
+         key->blit_scaled ?
+         brw_MOV(&func, S1, brw_imm_ud(i)) :
          brw_MOV(&func, S, brw_imm_uw(i));
       }
       texel_fetch(texture_data[stack_depth++]);
@@ -1553,9 +1721,21 @@ void
 brw_blorp_blit_program::expand_to_32_bits(struct brw_reg src,
                                           struct brw_reg dst)
 {
+   /* This function currently handles W, UW, D, UD & F types only */
+   assert(src.type == BRW_REGISTER_TYPE_W  ||
+          src.type == BRW_REGISTER_TYPE_UW ||
+          src.type == BRW_REGISTER_TYPE_D  ||
+          src.type == BRW_REGISTER_TYPE_UD ||
+          src.type == BRW_REGISTER_TYPE_F);
+
    brw_MOV(&func, vec8(dst), vec8(src));
    brw_set_compression_control(&func, BRW_COMPRESSION_2NDHALF);
-   brw_MOV(&func, offset(vec8(dst), 1), suboffset(vec8(src), 8));
+   if (src.type == BRW_REGISTER_TYPE_D  ||
+       src.type == BRW_REGISTER_TYPE_UD ||
+       src.type == BRW_REGISTER_TYPE_F)
+      brw_MOV(&func, offset(vec8(dst), 1), offset(vec8(src), 1));
+   else
+      brw_MOV(&func, offset(vec8(dst), 1), suboffset(vec8(src), 8));
    brw_set_compression_control(&func, BRW_COMPRESSION_NONE);
 }
 
@@ -1567,19 +1747,31 @@ brw_blorp_blit_program::texture_lookup(struct brw_reg dst,
 {
    struct brw_reg mrf =
       retype(vec16(brw_message_reg(base_mrf)), BRW_REGISTER_TYPE_UD);
+   struct brw_reg srcX, srcY;
+
+   if ((key->src_samples != 0 &&
+        (key->blit_scaled || key->blend)) ||
+        (key->src_tiled_w && key->dst_tiled_w)) {
+      srcX = X;
+      srcY = Y;
+   } else {
+      srcX = Xf_p;
+      srcY = Yf_p;
+   }
+
    for (int arg = 0; arg < num_args; ++arg) {
       switch (args[arg]) {
       case SAMPLER_MESSAGE_ARG_U_FLOAT:
-         expand_to_32_bits(X, retype(mrf, BRW_REGISTER_TYPE_F));
+         expand_to_32_bits(srcX, retype(mrf, BRW_REGISTER_TYPE_F));
          break;
       case SAMPLER_MESSAGE_ARG_V_FLOAT:
-         expand_to_32_bits(Y, retype(mrf, BRW_REGISTER_TYPE_F));
+         expand_to_32_bits(srcY, retype(mrf, BRW_REGISTER_TYPE_F));
          break;
       case SAMPLER_MESSAGE_ARG_U_INT:
-         expand_to_32_bits(X, mrf);
+         expand_to_32_bits(srcX, mrf);
          break;
       case SAMPLER_MESSAGE_ARG_V_INT:
-         expand_to_32_bits(Y, mrf);
+         expand_to_32_bits(srcY, mrf);
          break;
       case SAMPLER_MESSAGE_ARG_SI_INT:
          /* Note: on Gen7, this code may be reached with s_is_zero==true
@@ -1589,8 +1781,15 @@ brw_blorp_blit_program::texture_lookup(struct brw_reg dst,
           */
          if (s_is_zero)
             brw_MOV(&func, mrf, brw_imm_ud(0));
-         else
+         else {
+            /* Use different sample_index registers based on:
+             * If dst is a stencil buffer.
+             * If scaled blitting.
+             */
+            (key->dst_tiled_w || key->blit_scaled) ?
+            expand_to_32_bits(S1, mrf) :
             expand_to_32_bits(S, mrf);
+         }
          break;
       case SAMPLER_MESSAGE_ARG_MCS_INT:
          switch (key->tex_layout) {
@@ -1618,9 +1817,9 @@ brw_blorp_blit_program::texture_lookup(struct brw_reg dst,
    }
 
    brw_SAMPLE(&func,
-              retype(dst, BRW_REGISTER_TYPE_UW) /* dest */,
+              retype(dst, BRW_REGISTER_TYPE_F) /* dest */,
               base_mrf /* msg_reg_nr */,
-              brw_message_reg(base_mrf) /* src0 */,
+              retype(brw_message_reg(base_mrf), BRW_REGISTER_TYPE_F) /* src0 */,
               BRW_BLORP_TEXTURE_BINDING_TABLE_INDEX,
               0 /* sampler */,
               msg_type,
@@ -1633,10 +1832,17 @@ brw_blorp_blit_program::texture_lookup(struct brw_reg dst,
 
 #undef X
 #undef Y
+#undef Xuw
+#undef Yuw
+#undef Xf
+#undef Yf
 #undef U
 #undef V
 #undef S
+#undef S1
+#undef S2
 #undef SWAP_XY_AND_XPYP
+#undef SWAP_XY_AND_XPYP_UW
 
 void
 brw_blorp_blit_program::render_target_write()
@@ -1679,25 +1885,27 @@ brw_blorp_blit_program::render_target_write()
 
 
 void
-brw_blorp_coord_transform_params::setup(GLuint src0, GLuint dst0, GLuint dst1,
+brw_blorp_coord_transform_params::setup(GLuint src0, GLuint src1,
+                                        GLuint dst0, GLuint dst1,
                                         bool mirror)
 {
+   float scale = ((float) (src1 - src0)) / (dst1 - dst0);
    if (!mirror) {
       /* When not mirroring a coordinate (say, X), we need:
-       *   x' - src_x0 = x - dst_x0
+       *   x' - src_x0 = (x - dst_x0 + 0.5) * scale
        * Therefore:
-       *   x' = 1*x + (src_x0 - dst_x0)
+       *   x' = scale*x + src_x0 + (0.5 - dst_x0) * scale
        */
-      multiplier = 1;
-      offset = src0 - dst0;
+      multiplier = scale;
+      offset = src0 + (0.5f - (float) dst0) * scale;
    } else {
       /* When mirroring X we need:
-       *   x' - src_x0 = dst_x1 - x - 1
+       *   x' - src_x0 = (dst_x1 - x - 0.5) * scale
        * Therefore:
-       *   x' = -1*x + (src_x0 + dst_x1 - 1)
+       *   x' = -scale*x + src_x0 + (dst_x1 - 0.5) * scale
        */
-      multiplier = -1;
-      offset = src0 + dst1 - 1;
+      multiplier = -scale;
+      offset = src0 + ((float) dst1 - 0.5f) * scale;
    }
 }
 
@@ -1737,6 +1945,7 @@ brw_blorp_blit_params::brw_blorp_blit_params(struct brw_context *brw,
                                              struct intel_mipmap_tree *dst_mt,
                                              unsigned dst_level, unsigned dst_layer,
                                              GLuint src_x0, GLuint src_y0,
+                                             GLuint src_x1, GLuint src_y1,
                                              GLuint dst_x0, GLuint dst_y0,
                                              GLuint dst_x1, GLuint dst_y1,
                                              bool mirror_x, bool mirror_y)
@@ -1804,6 +2013,11 @@ brw_blorp_blit_params::brw_blorp_blit_params(struct brw_context *brw,
       wm_prog_key.persample_msaa_dispatch = true;
    }
 
+   /* Scaled blitting or not */
+   wm_prog_key.blit_scaled =
+      ((dst_x1 - dst_x0) == (src_x1 - src_x0) &&
+       (dst_y1 - dst_y0) == (src_y1 - src_y0)) ? false : true;
+
    /* The render path must be configured to use the same number of samples as
     * the destination buffer.
     */
@@ -1847,8 +2061,8 @@ brw_blorp_blit_params::brw_blorp_blit_params(struct brw_context *brw,
    y0 = wm_push_consts.dst_y0 = dst_y0;
    x1 = wm_push_consts.dst_x1 = dst_x1;
    y1 = wm_push_consts.dst_y1 = dst_y1;
-   wm_push_consts.x_transform.setup(src_x0, dst_x0, dst_x1, mirror_x);
-   wm_push_consts.y_transform.setup(src_y0, dst_y0, dst_y1, mirror_y);
+   wm_push_consts.x_transform.setup(src_x0, src_x1, dst_x0, dst_x1, mirror_x);
+   wm_push_consts.y_transform.setup(src_y0, src_y1, dst_y0, dst_y1, mirror_y);
 
    if (dst.num_samples <= 1 && dst_mt->num_samples > 1) {
       /* We must expand the rectangle we send through the rendering pipeline,
@@ -1968,7 +2182,8 @@ brw_blorp_blit_params::get_wm_prog(struct brw_context *brw,
    uint32_t prog_offset;
    if (!brw_search_cache(&brw->cache, BRW_BLORP_BLIT_PROG,
                          &this->wm_prog_key, sizeof(this->wm_prog_key),
-                         &prog_offset, prog_data)) {
+                         &prog_offset, prog_data) ||
+       unlikely(INTEL_DEBUG & DEBUG_BLORP)) {
       brw_blorp_blit_program prog(brw, &this->wm_prog_key);
       GLuint program_size;
       const GLuint *program = prog.compile(brw, &program_size);
diff --git a/src/mesa/drivers/dri/intel/intel_mipmap_tree.c b/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
index c5f9f1f..ed5770c 100644
--- a/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
@@ -1295,6 +1295,7 @@ intel_miptree_updownsample(struct intel_context *intel,
                            src, 0 /* level */, 0 /* layer */,
                            dst, 0 /* level */, 0 /* layer */,
                            src_x0, src_y0,
+                           width, height,
                            dst_x0, dst_y0,
                            width, height,
                            false, false /*mirror x, y*/);
@@ -1304,6 +1305,7 @@ intel_miptree_updownsample(struct intel_context *intel,
                               src->stencil_mt, 0 /* level */, 0 /* layer */,
                               dst->stencil_mt, 0 /* level */, 0 /* layer */,
                               src_x0, src_y0,
+                              width, height,
                               dst_x0, dst_y0,
                               width, height,
                               false, false /*mirror x, y*/);
-- 
1.8.1.4



More information about the mesa-dev mailing list