[Mesa-dev] [PATCH v2 4/5] i965/gen6+: Add code to perform blits on the render path ("blorp").

Paul Berry stereotype441 at gmail.com
Thu May 10 11:21:22 PDT 2012


This patch expands the "blorp" component to be able to perform blits
as well as HiZ resolves.  The new blitting code is located in
brw_blorp_blit.cpp.  This includes the necessary fragment shader code
to look up pixels in the source buffer (which is configured as a
texture) and output them to the destination buffer (which is
configured as the render target).

Most of the time the fragment shader code is simple and
straightforward, since it merely has to apply a coordinate offset,
read from the texture, and write to the render target.  However, in
the case of blitting stencil buffers, things are more complicated,
since the GPU stores stencil data using W tiling, and W tiling is not
supported for textures or render targets.  So, we set up the stencil
buffers as Y tiled, and emit fragment shader code that adjusts the
coordinates to account for the difference between W and Y tiling.
Furthermore, since a rectangular region in W tiling does not
necessarily correspond to a rectangular region in Y tiling, we widen
the rectangle primitive to the nearest tile boundary and have the
fragment shader "kill" any pixels that don't fall inside the actual
desired destination rectangle.

All of this is a necessary prerequisite for implementing MSAA, since
we'll need to be able to blit between multisample color, depth, and
stencil buffers and their non-multisampled counterparts, and none of
the existing blitting mechanisms support multisampling.

In addition, the new blitting code should speed up operations where we
previously fell back to software rasterization, such as blitting of
stencil buffers.  The current fallback sequence is: first we try to do
a blit using the hardware blitting engine.  If that fails we try to do
a blit using the render path.  If that also fails then we do the blit
using a meta-op (which may or may not fall back to software
rasterization).

Note that blitting using the render path has some limitations at the
moment: it only supports a few formats, and it doesn't support
clipping or scissoring.  These limitations will be addressed in future
patch series.

v2:
- Add the code that configures the WM program to
  gen{6,7}_emit_wm_config() and gen7_emit_ps_config() rather than
  creating separate ...enable() functions.
- Call intel_prepare_render before determining which miptrees we are
  blitting from/to, because it may cause miptrees to be reallocated.
- Allow the blit to mirror X and/or Y coordinates.
- Disable blorp blits on Gen7 for now, since they aren't working yet.
---
 src/mesa/drivers/dri/i965/Makefile.sources   |    1 +
 src/mesa/drivers/dri/i965/brw_blorp.cpp      |   32 +-
 src/mesa/drivers/dri/i965/brw_blorp.h        |  136 ++++-
 src/mesa/drivers/dri/i965/brw_blorp_blit.cpp |  863 ++++++++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_context.h      |    8 +
 src/mesa/drivers/dri/i965/gen6_blorp.cpp     |  397 +++++++++++-
 src/mesa/drivers/dri/i965/gen7_blorp.cpp     |  311 +++++++++-
 src/mesa/drivers/dri/intel/intel_fbo.c       |    9 +
 8 files changed, 1730 insertions(+), 27 deletions(-)
 create mode 100644 src/mesa/drivers/dri/i965/brw_blorp_blit.cpp

diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index ba28d39..d2b7d0c 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -111,6 +111,7 @@ i965_C_FILES = \
 
 i965_CXX_FILES = \
 	brw_blorp.cpp \
+	brw_blorp_blit.cpp \
 	brw_cubemap_normalize.cpp \
 	brw_fs.cpp \
 	brw_fs_cfg.cpp \
diff --git a/src/mesa/drivers/dri/i965/brw_blorp.cpp b/src/mesa/drivers/dri/i965/brw_blorp.cpp
index 95f039f..762d735 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp.cpp
@@ -35,6 +35,11 @@ brw_blorp_mip_info::brw_blorp_mip_info()
 {
 }
 
+brw_blorp_surface_info::brw_blorp_surface_info()
+   : map_stencil_as_y_tiled(false)
+{
+}
+
 void
 brw_blorp_mip_info::set(struct intel_mipmap_tree *mt,
                         unsigned int level, unsigned int layer)
@@ -47,6 +52,23 @@ brw_blorp_mip_info::set(struct intel_mipmap_tree *mt,
 }
 
 void
+brw_blorp_surface_info::set(struct intel_mipmap_tree *mt,
+                            unsigned int level, unsigned int layer)
+{
+   brw_blorp_mip_info::set(mt, level, layer);
+
+   if (mt->format == MESA_FORMAT_S8) {
+      /* The miptree is a W-tiled stencil buffer.  Surface states can't be set
+       * up for W tiling, so we'll need to use Y tiling and have the WM
+       * program swizzle the coordinates.
+       */
+      this->map_stencil_as_y_tiled = true;
+   } else {
+      this->map_stencil_as_y_tiled = false;
+   }
+}
+
+void
 brw_blorp_mip_info::get_draw_offsets(uint32_t *draw_x, uint32_t *draw_y) const
 {
    /* Construct a dummy renderbuffer just to extract tile offsets. */
@@ -65,7 +87,8 @@ brw_blorp_params::brw_blorp_params()
      x1(0),
      y1(0),
      depth_format(0),
-     hiz_op(GEN6_HIZ_OP_NONE)
+     hiz_op(GEN6_HIZ_OP_NONE),
+     use_wm_prog(false)
 {
 }
 
@@ -106,3 +129,10 @@ brw_hiz_op_params::brw_hiz_op_params(struct intel_mipmap_tree *mt,
    default:                    assert(0); break;
    }
 }
+
+uint32_t
+brw_hiz_op_params::get_wm_prog(struct brw_context *brw,
+                               brw_blorp_prog_data **prog_data) const
+{
+   return 0;
+}
diff --git a/src/mesa/drivers/dri/i965/brw_blorp.h b/src/mesa/drivers/dri/i965/brw_blorp.h
index c1c8334..b6b659d 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp.h
+++ b/src/mesa/drivers/dri/i965/brw_blorp.h
@@ -44,13 +44,24 @@ enum gen6_hiz_op {
    GEN6_HIZ_OP_NONE,
 };
 
+
+/**
+ * Binding table indices used by BLORP.
+ */
+enum {
+   BRW_BLORP_TEXTURE_BINDING_TABLE_INDEX,
+   BRW_BLORP_RENDERBUFFER_BINDING_TABLE_INDEX,
+   BRW_BLORP_NUM_BINDING_TABLE_ENTRIES
+};
+
+
 class brw_blorp_mip_info
 {
 public:
    brw_blorp_mip_info();
 
-   void set(struct intel_mipmap_tree *mt,
-            unsigned int level, unsigned int layer);
+   virtual void set(struct intel_mipmap_tree *mt,
+                    unsigned int level, unsigned int layer);
    void get_draw_offsets(uint32_t *draw_x, uint32_t *draw_y) const;
 
    void get_miplevel_dims(uint32_t *width, uint32_t *height) const
@@ -64,11 +75,71 @@ public:
    unsigned int layer;
 };
 
+class brw_blorp_surface_info : public brw_blorp_mip_info
+{
+public:
+   brw_blorp_surface_info();
+
+   virtual void set(struct intel_mipmap_tree *mt,
+                    unsigned int level, unsigned int layer);
+
+   /* Setting this flag indicates that the buffer's contents are W-tiled
+    * stencil data, but the surface state should be set up for Y tiled
+    * MESA_FORMAT_R8 data (this is necessary because surface states don't
+    * support W tiling).
+    *
+    * Since W tiles are 64 pixels wide by 64 pixels high, whereas Y tiles of
+    * MESA_FORMAT_R8 data are 128 pixels wide by 32 pixels high, the width and
+    * pitch stored in the surface state will be multiplied by 2, and the
+    * height will be halved.  Also, since W and Y tiles store their data in a
+    * different order, the width and height will be rounded up to a multiple
+    * of the tile size, to ensure that the WM program can access the full
+    * width and height of the buffer.
+    */
+   bool map_stencil_as_y_tiled;
+};
+
+
+struct brw_blorp_coord_transform_params
+{
+   void setup(GLuint src0, GLuint dst0, GLuint dst1,
+              bool mirror);
+
+   int16_t multiplier;
+   int16_t offset;
+};
+
+
+struct brw_blorp_wm_push_constants
+{
+   uint16_t dst_x0;
+   uint16_t dst_x1;
+   uint16_t dst_y0;
+   uint16_t dst_y1;
+   brw_blorp_coord_transform_params x_transform;
+   brw_blorp_coord_transform_params y_transform;
+
+   /* Pad out to an integral number of registers */
+   uint16_t pad[8];
+};
+
+/* Every 32 bytes of push constant data constitutes one GEN register. */
+const unsigned int BRW_BLORP_NUM_PUSH_CONST_REGS =
+   sizeof(brw_blorp_wm_push_constants) / 32;
+
+struct brw_blorp_prog_data
+{
+   unsigned int first_curbe_grf;
+};
+
 class brw_blorp_params
 {
 public:
    brw_blorp_params();
 
+   virtual uint32_t get_wm_prog(struct brw_context *brw,
+                                brw_blorp_prog_data **prog_data) const = 0;
+
    void exec(struct intel_context *intel) const;
 
    uint32_t x0;
@@ -77,7 +148,11 @@ public:
    uint32_t y1;
    brw_blorp_mip_info depth;
    uint32_t depth_format;
+   brw_blorp_surface_info src;
+   brw_blorp_surface_info dst;
    enum gen6_hiz_op hiz_op;
+   bool use_wm_prog;
+   brw_blorp_wm_push_constants wm_push_consts;
 };
 
 /**
@@ -95,6 +170,45 @@ public:
    brw_hiz_op_params(struct intel_mipmap_tree *mt,
                      unsigned int level, unsigned int layer,
                      gen6_hiz_op op);
+
+   virtual uint32_t get_wm_prog(struct brw_context *brw,
+                                brw_blorp_prog_data **prog_data) const;
+};
+
+struct brw_blorp_blit_prog_key
+{
+   /* True if the source image is W tiled.  If true, the surface state for the
+    * source image must be configured as Y tiled.
+    */
+   bool src_tiled_w;
+
+   /* True if the destination image is W tiled.  If true, the surface state
+    * for the render target must be configured as Y tiled.
+    */
+   bool dst_tiled_w;
+
+   /* True if the rectangle being sent through the rendering pipeline might be
+    * larger than the destination rectangle, so the WM program should kill any
+    * pixels that are outside the destination rectangle.
+    */
+   bool use_kill;
+};
+
+class brw_blorp_blit_params : public brw_blorp_params
+{
+public:
+   brw_blorp_blit_params(struct intel_mipmap_tree *src_mt,
+                         struct intel_mipmap_tree *dst_mt,
+                         GLuint src_x0, GLuint src_y0,
+                         GLuint dst_x0, GLuint dst_y0,
+                         GLuint width, GLuint height,
+                         bool mirror_x, bool mirror_y);
+
+   virtual uint32_t get_wm_prog(struct brw_context *brw,
+                                brw_blorp_prog_data **prog_data) const;
+
+private:
+   brw_blorp_blit_prog_key wm_prog_key;
 };
 
 /**
@@ -119,11 +233,29 @@ void
 gen6_blorp_emit_vertices(struct brw_context *brw,
                          const brw_blorp_params *params);
 
+uint32_t
+gen6_blorp_emit_blend_state(struct brw_context *brw,
+                            const brw_blorp_params *params);
+
+uint32_t
+gen6_blorp_emit_cc_state(struct brw_context *brw,
+                         const brw_blorp_params *params);
+
+uint32_t
+gen6_blorp_emit_wm_constants(struct brw_context *brw,
+                             const brw_blorp_params *params);
+
 void
 gen6_blorp_emit_vs_disable(struct brw_context *brw,
                            const brw_blorp_params *params);
 
 uint32_t
+gen6_blorp_emit_binding_table(struct brw_context *brw,
+                              const brw_blorp_params *params,
+                              uint32_t wm_surf_offset_renderbuffer,
+                              uint32_t wm_surf_offset_texture);
+
+uint32_t
 gen6_blorp_emit_depth_stencil_state(struct brw_context *brw,
                                     const brw_blorp_params *params);
 
diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
new file mode 100644
index 0000000..cce5d1b
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
@@ -0,0 +1,863 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "main/teximage.h"
+
+#include "glsl/ralloc.h"
+
+#include "intel_fbo.h"
+
+#include "brw_blorp.h"
+#include "brw_context.h"
+#include "brw_eu.h"
+#include "brw_state.h"
+
+
+/**
+ * Helper function for handling mirror image blits.
+ *
+ * If coord0 > coord1, swap them and invert the "mirror" boolean.
+ */
+static inline void
+fixup_mirroring(bool &mirror, GLint &coord0, GLint &coord1)
+{
+   if (coord0 > coord1) {
+      mirror = !mirror;
+      GLint tmp = coord0;
+      coord0 = coord1;
+      coord1 = tmp;
+   }
+}
+
+
+static bool
+try_blorp_blit(struct intel_context *intel,
+               GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
+               GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
+               GLenum filter, GLbitfield buffer_bit)
+{
+   struct gl_context *ctx = &intel->ctx;
+
+   /* Sync up the state of window system buffers.  We need to do this before
+    * we go looking for the buffers.
+    */
+   intel_prepare_render(intel);
+
+   /* Find buffers */
+   const struct gl_framebuffer *read_fb = ctx->ReadBuffer;
+   const struct gl_framebuffer *draw_fb = ctx->DrawBuffer;
+   struct gl_renderbuffer *src_rb;
+   struct gl_renderbuffer *dst_rb;
+   switch (buffer_bit) {
+   case GL_COLOR_BUFFER_BIT:
+      src_rb = read_fb->_ColorReadBuffer;
+      dst_rb =
+         draw_fb->Attachment[
+            draw_fb->_ColorDrawBufferIndexes[0]].Renderbuffer;
+      break;
+   case GL_DEPTH_BUFFER_BIT:
+      src_rb = read_fb->Attachment[BUFFER_DEPTH].Renderbuffer;
+      dst_rb = draw_fb->Attachment[BUFFER_DEPTH].Renderbuffer;
+      break;
+   case GL_STENCIL_BUFFER_BIT:
+      src_rb = read_fb->Attachment[BUFFER_STENCIL].Renderbuffer;
+      dst_rb = draw_fb->Attachment[BUFFER_STENCIL].Renderbuffer;
+      break;
+   default:
+      assert(false);
+   }
+
+   /* Validate source */
+   if (!src_rb) return false;
+   struct intel_renderbuffer *src_irb = intel_renderbuffer(src_rb);
+   struct intel_mipmap_tree *src_mt = src_irb->mt;
+   if (!src_mt) return false;
+   if (buffer_bit == GL_STENCIL_BUFFER_BIT && src_mt->stencil_mt)
+      src_mt = src_mt->stencil_mt;
+   switch (src_mt->format) {
+   case MESA_FORMAT_ARGB8888:
+   case MESA_FORMAT_X8_Z24:
+   case MESA_FORMAT_S8:
+      break; /* Supported */
+   default:
+      /* Unsupported format.
+       *
+       * TODO: need to support all formats that are allowed as multisample
+       * render targets.
+       */
+      return false;
+   }
+
+   /* Validate destination */
+   if (!dst_rb) return false;
+   struct intel_renderbuffer *dst_irb = intel_renderbuffer(dst_rb);
+   struct intel_mipmap_tree *dst_mt = dst_irb->mt;
+   if (!dst_mt) return false;
+   if (buffer_bit == GL_STENCIL_BUFFER_BIT && dst_mt->stencil_mt)
+      dst_mt = dst_mt->stencil_mt;
+   switch (dst_mt->format) {
+   case MESA_FORMAT_ARGB8888:
+   case MESA_FORMAT_X8_Z24:
+   case MESA_FORMAT_S8:
+      break; /* Supported */
+   default:
+      /* Unsupported format.
+       *
+       * TODO: need to support all formats that are allowed as multisample
+       * render targets.
+       */
+      return false;
+   }
+
+   /* Account for the fact that in the system framebuffer, the origin is at
+    * the lower left.
+    */
+   if (read_fb->Name == 0) {
+      srcY0 = read_fb->Height - srcY0;
+      srcY1 = read_fb->Height - srcY1;
+   }
+   if (draw_fb->Name == 0) {
+      dstY0 = draw_fb->Height - dstY0;
+      dstY1 = draw_fb->Height - dstY1;
+   }
+
+   /* Detect if the blit needs to be mirrored */
+   bool mirror_x = false, mirror_y = false;
+   fixup_mirroring(mirror_x, srcX0, srcX1);
+   fixup_mirroring(mirror_x, dstX0, dstX1);
+   fixup_mirroring(mirror_y, srcY0, srcY1);
+   fixup_mirroring(mirror_y, dstY0, dstY1);
+
+   /* Make sure width and height match */
+   GLsizei width = srcX1 - srcX0;
+   GLsizei height = srcY1 - srcY0;
+   if (width != dstX1 - dstX0) return false;
+   if (height != dstY1 - dstY0) return false;
+
+   /* Make sure width and height don't need to be clipped or scissored.
+    * TODO: support clipping and scissoring.
+    */
+   if (srcX0 < 0 || (GLuint) srcX1 > read_fb->Width) return false;
+   if (srcY0 < 0 || (GLuint) srcY1 > read_fb->Height) return false;
+   if (dstX0 < 0 || (GLuint) dstX1 > draw_fb->Width) return false;
+   if (dstY0 < 0 || (GLuint) dstY1 > draw_fb->Height) return false;
+   if (ctx->Scissor.Enabled) return false;
+
+   /* Get ready to blit.  This includes depth resolving the src and dst
+    * buffers if necessary.
+    */
+   intel_renderbuffer_resolve_depth(intel, src_irb);
+   intel_renderbuffer_resolve_depth(intel, dst_irb);
+
+   /* Do the blit */
+   brw_blorp_blit_params params(src_mt, dst_mt,
+                                srcX0, srcY0, dstX0, dstY0, dstX1, dstY1,
+                                mirror_x, mirror_y);
+   params.exec(intel);
+
+   /* Mark the dst buffer as needing a HiZ resolve if necessary. */
+   intel_renderbuffer_set_needs_hiz_resolve(dst_irb);
+
+   return true;
+}
+
+GLbitfield
+brw_blorp_framebuffer(struct intel_context *intel,
+                      GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
+                      GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
+                      GLbitfield mask, GLenum filter)
+{
+   /* BLORP is only supported on Gen6.  TODO: implement on Gen7. */
+   if (intel->gen != 6)
+      return mask;
+
+   static GLbitfield buffer_bits[] = {
+      GL_COLOR_BUFFER_BIT,
+      GL_DEPTH_BUFFER_BIT,
+      GL_STENCIL_BUFFER_BIT,
+   };
+
+   for (unsigned int i = 0; i < ARRAY_SIZE(buffer_bits); ++i) {
+      if ((mask & buffer_bits[i]) &&
+       try_blorp_blit(intel,
+                      srcX0, srcY0, srcX1, srcY1,
+                      dstX0, dstY0, dstX1, dstY1,
+                      filter, buffer_bits[i])) {
+         mask &= ~buffer_bits[i];
+      }
+   }
+
+   return mask;
+}
+
+/**
+ * Generator for WM programs used in BLORP blits.
+ *
+ * The bulk of the work done by the WM program is to wrap and unwrap the
+ * coordinate transformations used by the hardware to store surfaces in
+ * memory.  The hardware transforms a pixel location (X, Y) to a memory offset
+ * by the following formulas:
+ *
+ *   offset = tile(tiling_format, X, Y)
+ *   (X, Y) = detile(tiling_format, offset)
+ *
+ * For X tiling, tile() combines together the low-order bits of the X and Y
+ * coordinates in the pattern 0byyyxxxxxxxxx, creating 4k tiles that are 512
+ * bytes wide and 8 rows high:
+ *
+ *   tile(x_tiled, X, Y) = A
+ *     where A = tile_num << 12 | offset
+ *           tile_num = (Y >> 3) * tile_pitch + (X' >> 9)
+ *           offset = (Y & 0b111) << 9
+ *                    | (X' & 0b111111111)
+ *           X' = X * cpp
+ *   detile(x_tiled, A) = (X, Y)
+ *     where X = X' / cpp
+ *           Y = (tile_num / tile_pitch) << 3
+ *               | (A & 0b111000000000) >> 9
+ *           X' = (tile_num % tile_pitch) << 9
+ *                | (A & 0b111111111)
+ *
+ * (In all tiling formulas, cpp is the number of bytes occupied by a single
+ * pixel ("chars per pixel"), and tile_pitch is the number of 4k tiles
+ * required to fill the width of the surface).
+ *
+ * For Y tiling, tile() combines together the low-order bits of the X and Y
+ * coordinates in the pattern 0bxxxyyyyyxxxx, creating 4k tiles that are 128
+ * bytes wide and 32 rows high:
+ *
+ *   tile(y_tiled, X, Y) = A
+ *     where A = tile_num << 12 | offset
+ *           tile_num = (Y >> 5) * tile_pitch + (X' >> 7)
+ *           offset = (X' & 0b1110000) << 5
+ *                    | (Y & 0b11111) << 4
+ *                    | (X' & 0b1111)
+ *           X' = X * cpp
+ *   detile(y_tiled, A) = (X, Y)
+ *     where X = X' / cpp
+ *           Y = (tile_num / tile_pitch) << 5
+ *               | (A & 0b111110000) >> 4
+ *           X' = (tile_num % tile_pitch) << 7
+ *                | (A & 0b111000000000) >> 5
+ *                | (A & 0b1111)
+ *
+ * For W tiling, tile() combines together the low-order bits of the X and Y
+ * coordinates in the pattern 0bxxxyyyyxyxyx, creating 4k tiles that are 64
+ * bytes wide and 64 rows high (note that W tiling is only used for stencil
+ * buffers, which always have cpp = 1):
+ *
+ *   tile(w_tiled, X, Y) = A
+ *     where A = tile_num << 12 | offset
+ *           tile_num = (Y >> 6) * tile_pitch + (X' >> 6)
+ *           offset = (X' & 0b111000) << 6
+ *                    | (Y & 0b111100) << 3
+ *                    | (X' & 0b100) << 2
+ *                    | (Y & 0b10) << 2
+ *                    | (X' & 0b10) << 1
+ *                    | (Y & 0b1) << 1
+ *                    | (X' & 0b1)
+ *           X' = X * cpp = X
+ *   detile(w_tiled, A) = (X, Y)
+ *     where X = X' / cpp = X'
+ *           Y = (tile_num / tile_pitch) << 6
+ *               | (A & 0b111100000) >> 3
+ *               | (A & 0b1000) >> 2
+ *               | (A & 0b10) >> 1
+ *           X' = (tile_num % tile_pitch) << 6
+ *                | (A & 0b111000000000) >> 6
+ *                | (A & 0b10000) >> 2
+ *                | (A & 0b100) >> 1
+ *                | (A & 0b1)
+ *
+ * Finally, for a non-tiled surface, tile() simply combines together the X and
+ * Y coordinates in the natural way:
+ *
+ *   tile(untiled, X, Y) = A
+ *     where A = Y * pitch + X'
+ *           X' = X * cpp
+ *   detile(untiled, A) = (X, Y)
+ *     where X = X' / cpp
+ *           Y = A / pitch
+ *           X' = A % pitch
+ *
+ * (In these formulas, pitch is the number of bytes occupied by a single row
+ * of pixels).
+ */
+class brw_blorp_blit_program
+{
+public:
+   brw_blorp_blit_program(struct brw_context *brw,
+                          const brw_blorp_blit_prog_key *key);
+   ~brw_blorp_blit_program();
+
+   const GLuint *compile(struct brw_context *brw, GLuint *program_size);
+
+   brw_blorp_prog_data prog_data;
+
+private:
+   void alloc_regs();
+   void alloc_push_const_regs(int base_reg);
+   void compute_frag_coords();
+   void translate_tiling(bool old_tiled_w, bool new_tiled_w);
+   void kill_if_outside_dst_rect();
+   void translate_dst_to_src();
+   void texel_fetch();
+   void texture_lookup(GLuint msg_type,
+                       struct brw_reg mrf_u, struct brw_reg mrf_v);
+   void render_target_write();
+
+   void *mem_ctx;
+   struct brw_context *brw;
+   const brw_blorp_blit_prog_key *key;
+   struct brw_compile func;
+
+   /* Thread dispatch header */
+   struct brw_reg R0;
+
+   /* Pixel X/Y coordinates (always in R1). */
+   struct brw_reg R1;
+
+   /* Push constants */
+   struct brw_reg dst_x0;
+   struct brw_reg dst_x1;
+   struct brw_reg dst_y0;
+   struct brw_reg dst_y1;
+   struct {
+      struct brw_reg multiplier;
+      struct brw_reg offset;
+   } x_transform, y_transform;
+
+   /* Data returned from texture lookup (4 vec16's) */
+   struct brw_reg Rdata;
+
+   /* X coordinates.  We have two of them so that we can perform coordinate
+    * transformations easily.
+    */
+   struct brw_reg x_coords[2];
+
+   /* Y coordinates.  We have two of them so that we can perform coordinate
+    * transformations easily.
+    */
+   struct brw_reg y_coords[2];
+
+   /* Which element of x_coords and y_coords is currently in use.
+    */
+   int xy_coord_index;
+
+   /* Temporaries */
+   struct brw_reg t1;
+   struct brw_reg t2;
+
+   /* M2-3: u coordinate */
+   GLuint base_mrf;
+   struct brw_reg mrf_u_float;
+
+   /* M4-5: v coordinate */
+   struct brw_reg mrf_v_float;
+};
+
+brw_blorp_blit_program::brw_blorp_blit_program(
+      struct brw_context *brw,
+      const brw_blorp_blit_prog_key *key)
+   : mem_ctx(ralloc_context(NULL)),
+     brw(brw),
+     key(key)
+{
+   brw_init_compile(brw, &func, mem_ctx);
+}
+
+brw_blorp_blit_program::~brw_blorp_blit_program()
+{
+   ralloc_free(mem_ctx);
+}
+
+const GLuint *
+brw_blorp_blit_program::compile(struct brw_context *brw,
+                                GLuint *program_size)
+{
+   brw_set_compression_control(&func, BRW_COMPRESSION_NONE);
+
+   alloc_regs();
+   compute_frag_coords();
+
+   /* Render target and texture hardware don't support W tiling. */
+   const bool rt_tiled_w = false;
+   const bool tex_tiled_w = false;
+
+   /* The address that data will be written to is determined by the
+    * coordinates supplied to the WM thread and the tiling of the render
+    * target, according to the formula:
+    *
+    * (X, Y) = detile(rt_tiling, offset)
+    *
+    * If the actual tiling of the destination surface is not the same as the
+    * configuration of the render target, then these coordinates are wrong and
+    * we have to adjust them to compensate for the difference.
+    */
+   if (rt_tiled_w != key->dst_tiled_w)
+      translate_tiling(rt_tiled_w, key->dst_tiled_w);
+
+   /* Now (X, Y) = detile(dst_tiling, offset).
+    *
+    * That is: X and Y now contain the true coordinates of the data that the
+    * WM thread should output.
+    *
+    * If we need to kill pixels that are outside the destination rectangle,
+    * now is the time to do it.
+    */
+
+   if (key->use_kill)
+      kill_if_outside_dst_rect();
+
+   /* Next, apply a translation to obtain coordinates in the source image. */
+   translate_dst_to_src();
+
+   /* X and Y are now the coordinates of the pixel in the source image that we
+    * want to texture from.
+    *
+    * The address that we want to fetch from is
+    * related to the X and Y values according to the formula:
+    *
+    * (X, Y) = detile(src_tiling, offset).
+    *
+    * If the actual tiling of the source surface is not the same as the
+    * configuration of the texture, then we need to adjust the coordinates to
+    * compensate for the difference.
+    */
+   if (tex_tiled_w != key->src_tiled_w)
+      translate_tiling(key->src_tiled_w, tex_tiled_w);
+
+   /* Now (X, Y) = detile(tex_tiling, offset).
+    *
+    * In other words: X and Y now contain values which, when passed to
+    * the texturing unit, will cause data to be read from the correct
+    * memory location.  So we can fetch the texel now.
+    */
+   texel_fetch();
+
+   /* Finally, write the fetched value to the render target and terminate the
+    * thread.
+    */
+   render_target_write();
+   return brw_get_program(&func, program_size);
+}
+
+void
+brw_blorp_blit_program::alloc_push_const_regs(int base_reg)
+{
+#define CONST_LOC(name) offsetof(brw_blorp_wm_push_constants, name)
+#define ALLOC_REG(name) \
+   this->name = \
+      brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, base_reg, CONST_LOC(name) / 2)
+
+   ALLOC_REG(dst_x0);
+   ALLOC_REG(dst_x1);
+   ALLOC_REG(dst_y0);
+   ALLOC_REG(dst_y1);
+   ALLOC_REG(x_transform.multiplier);
+   ALLOC_REG(x_transform.offset);
+   ALLOC_REG(y_transform.multiplier);
+   ALLOC_REG(y_transform.offset);
+#undef CONST_LOC
+#undef ALLOC_REG
+}
+
+void
+brw_blorp_blit_program::alloc_regs()
+{
+   int reg = 0;
+   this->R0 = retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW);
+   this->R1 = retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW);
+   prog_data.first_curbe_grf = reg;
+   alloc_push_const_regs(reg);
+   reg += BRW_BLORP_NUM_PUSH_CONST_REGS;
+   this->Rdata = vec16(brw_vec8_grf(reg, 0)); reg += 8;
+   for (int i = 0; i < 2; ++i) {
+      this->x_coords[i]
+         = vec16(retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW));
+      this->y_coords[i]
+         = vec16(retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW));
+   }
+   this->xy_coord_index = 0;
+   this->t1 = vec16(retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW));
+   this->t2 = vec16(retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW));
+
+   int mrf = 2;
+   this->base_mrf = mrf;
+   this->mrf_u_float = vec16(brw_message_reg(mrf)); mrf += 2;
+   this->mrf_v_float = vec16(brw_message_reg(mrf)); mrf += 2;
+}
+
+/* In the code that follows, X and Y can be used to quickly refer to the
+ * active elements of x_coords and y_coords, and Xp and Yp ("X prime" and "Y
+ * prime") to the inactive elements.
+ */
+#define X x_coords[xy_coord_index]
+#define Y y_coords[xy_coord_index]
+#define Xp x_coords[!xy_coord_index]
+#define Yp y_coords[!xy_coord_index]
+
+/* Quickly swap the roles of (X, Y) and (Xp, Yp).  Saves us from having to do
+ * MOVs to transfer (Xp, Yp) to (X, Y) after a coordinate transformation.
+ */
+#define SWAP_XY_AND_XPYP() xy_coord_index = !xy_coord_index;
+
+/**
+ * Emit code to compute the X and Y coordinates of the pixels being rendered
+ * by this WM invocation.
+ *
+ * Assuming the render target is set up for Y tiling, these (X, Y) values are
+ * related to the address offset where outputs will be written by the formula:
+ *
+ *   (X, Y) = detile(y_tiled, offset).
+ *
+ * (See brw_blorp_blit_program).
+ */
+void
+brw_blorp_blit_program::compute_frag_coords()
+{
+   /* R1.2[15:0] = X coordinate of upper left pixel of subspan 0 (pixel 0)
+    * R1.3[15:0] = X coordinate of upper left pixel of subspan 1 (pixel 4)
+    * R1.4[15:0] = X coordinate of upper left pixel of subspan 2 (pixel 8)
+    * R1.5[15:0] = X coordinate of upper left pixel of subspan 3 (pixel 12)
+    *
+    * Pixels within a subspan are laid out in this arrangement:
+    * 0 1
+    * 2 3
+    *
+    * So, to compute the coordinates of each pixel, we need to read every 2nd
+    * 16-bit value (vstride=2) from R1, starting at the 4th 16-bit value
+    * (suboffset=4), and duplicate each value 4 times (hstride=0, width=4).
+    * In other words, the data we want to access is R1.4<2;4,0>UW.
+    *
+    * Then, we need to add the repeating sequence (0, 1, 0, 1, ...) to the
+    * result, since pixels n+1 and n+3 are in the right half of the subspan.
+    */
+   brw_ADD(&func, X, stride(suboffset(R1, 4), 2, 4, 0), brw_imm_v(0x10101010));
+
+   /* Similarly, Y coordinates for subspans come from R1.2[31:16] through
+    * R1.5[31:16], so to get pixel Y coordinates we need to start at the 5th
+    * 16-bit value instead of the 4th (R1.5<2;4,0>UW instead of
+    * R1.4<2;4,0>UW).
+    *
+    * And we need to add the repeating sequence (0, 0, 1, 1, ...), since
+    * pixels n+2 and n+3 are in the bottom half of the subspan.
+    */
+   brw_ADD(&func, Y, stride(suboffset(R1, 5), 2, 4, 0), brw_imm_v(0x11001100));
+}
+
+/**
+ * Emit code to compensate for the difference between Y and W tiling.
+ *
+ * This code modifies the X and Y coordinates according to the formula:
+ *
+ *   (X', Y') = detile(new_tiling, tile(old_tiling, X, Y))
+ *
+ * (See brw_blorp_blit_program).
+ *
+ * It can only translate between W and Y tiling, so old_tiling and new_tiling
+ * are passed as booleans (old_tiled_w, new_tiled_w): true = W, false = Y.
+ */
+void
+brw_blorp_blit_program::translate_tiling(bool old_tiled_w, bool new_tiled_w)
+{
+   if (old_tiled_w == new_tiled_w)
+      return;
+
+   if (new_tiled_w) {
+      /* Given X and Y coordinates that describe an address using Y tiling,
+       * translate to the X and Y coordinates that describe the same address
+       * using W tiling.
+       *
+       * If we break down the low order bits of X and Y, using a
+       * single letter to represent each low-order bit:
+       *
+       *   X = A << 7 | 0bBCDEFGH
+       *   Y = J << 5 | 0bKLMNP                                       (1)
+       *
+       * Then we can apply the Y tiling formula to see the memory offset being
+       * addressed:
+       *
+       *   offset = (J * tile_pitch + A) << 12 | 0bBCDKLMNPEFGH       (2)
+       *
+       * If we apply the W detiling formula to this memory location, we find
+       * that the corresponding X' and Y' coordinates are:
+       *
+       *   X' = A << 6 | 0bBCDPFH                                     (3)
+       *   Y' = J << 6 | 0bKLMNEG
+       *
+       * Combining (1) and (3), we see that to transform (X, Y) to (X', Y'),
+       * we need to make the following computation:
+       *
+       *   X' = (X & ~0b1011) >> 1 | (Y & 0b1) << 2 | X & 0b1         (4)
+       *   Y' = (Y & ~0b1) << 1 | (X & 0b1000) >> 2 | (X & 0b10) >> 1
+       */
+      brw_AND(&func, t1, X, brw_imm_uw(0xfff4)); /* X & ~0b1011 */
+      brw_SHR(&func, t1, t1, brw_imm_uw(1)); /* (X & ~0b1011) >> 1 */
+      brw_AND(&func, t2, Y, brw_imm_uw(1)); /* Y & 0b1 */
+      brw_SHL(&func, t2, t2, brw_imm_uw(2)); /* (Y & 0b1) << 2 */
+      brw_OR(&func, t1, t1, t2); /* (X & ~0b1011) >> 1 | (Y & 0b1) << 2 */
+      brw_AND(&func, t2, X, brw_imm_uw(1)); /* X & 0b1 */
+      brw_OR(&func, Xp, t1, t2);
+      brw_AND(&func, t1, Y, brw_imm_uw(0xfffe)); /* Y & ~0b1 */
+      brw_SHL(&func, t1, t1, brw_imm_uw(1)); /* (Y & ~0b1) << 1 */
+      brw_AND(&func, t2, X, brw_imm_uw(8)); /* X & 0b1000 */
+      brw_SHR(&func, t2, t2, brw_imm_uw(2)); /* (X & 0b1000) >> 2 */
+      brw_OR(&func, t1, t1, t2); /* (Y & ~0b1) << 1 | (X & 0b1000) >> 2 */
+      brw_AND(&func, t2, X, brw_imm_uw(2)); /* X & 0b10 */
+      brw_SHR(&func, t2, t2, brw_imm_uw(1)); /* (X & 0b10) >> 1 */
+      brw_OR(&func, Yp, t1, t2);
+      SWAP_XY_AND_XPYP();
+   } else {
+      /* Applying the same logic as above, but in reverse, we obtain the
+       * formulas:
+       *
+       * X' = (X & ~0b101) << 1 | (Y & 0b10) << 2 | (Y & 0b1) << 1 | X & 0b1
+       * Y' = (Y & ~0b11) >> 1 | (X & 0b100) >> 2
+       */
+      brw_AND(&func, t1, X, brw_imm_uw(0xfffa)); /* X & ~0b101 */
+      brw_SHL(&func, t1, t1, brw_imm_uw(1)); /* (X & ~0b101) << 1 */
+      brw_AND(&func, t2, Y, brw_imm_uw(2)); /* Y & 0b10 */
+      brw_SHL(&func, t2, t2, brw_imm_uw(2)); /* (Y & 0b10) << 2 */
+      brw_OR(&func, t1, t1, t2); /* (X & ~0b101) << 1 | (Y & 0b10) << 2 */
+      brw_AND(&func, t2, Y, brw_imm_uw(1)); /* Y & 0b1 */
+      brw_SHL(&func, t2, t2, brw_imm_uw(1)); /* (Y & 0b1) << 1 */
+      brw_OR(&func, t1, t1, t2); /* (X & ~0b101) << 1 | (Y & 0b10) << 2
+                                    | (Y & 0b1) << 1 */
+      brw_AND(&func, t2, X, brw_imm_uw(1)); /* X & 0b1 */
+      brw_OR(&func, Xp, t1, t2);
+      brw_AND(&func, t1, Y, brw_imm_uw(0xfffc)); /* Y & ~0b11 */
+      brw_SHR(&func, t1, t1, brw_imm_uw(1)); /* (Y & ~0b11) >> 1 */
+      brw_AND(&func, t2, X, brw_imm_uw(4)); /* X & 0b100 */
+      brw_SHR(&func, t2, t2, brw_imm_uw(2)); /* (X & 0b100) >> 2 */
+      brw_OR(&func, Yp, t1, t2);
+      SWAP_XY_AND_XPYP();
+   }
+}
+
+/**
+ * Emit code that kills pixels whose X and Y coordinates are outside the
+ * destination rectangle given by the push constants, i.e. pixels failing
+ * dst_x0 <= X < dst_x1 or dst_y0 <= Y < dst_y1 (x1/y1 are exclusive).
+ */
+void
+brw_blorp_blit_program::kill_if_outside_dst_rect()
+{
+   struct brw_reg f0 = brw_flag_reg();
+   struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
+   struct brw_reg null16 = vec16(retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
+
+   brw_CMP(&func, null16, BRW_CONDITIONAL_GE, X, dst_x0);
+   brw_CMP(&func, null16, BRW_CONDITIONAL_GE, Y, dst_y0);
+   brw_CMP(&func, null16, BRW_CONDITIONAL_L, X, dst_x1);
+   brw_CMP(&func, null16, BRW_CONDITIONAL_L, Y, dst_y1);
+
+   brw_set_predicate_control(&func, BRW_PREDICATE_NONE);
+   brw_push_insn_state(&func);
+   brw_set_mask_control(&func, BRW_MASK_DISABLE);
+   brw_AND(&func, g1, f0, g1);
+   brw_pop_insn_state(&func);
+}
+
+/**
+ * Emit code mapping destination (X, Y) to source (X, Y): each coordinate is
+ * multiplied by its transform's multiplier, then the transform's offset added.
+ */
+void
+brw_blorp_blit_program::translate_dst_to_src()
+{
+   brw_MUL(&func, Xp, X, x_transform.multiplier);
+   brw_MUL(&func, Yp, Y, y_transform.multiplier);
+   brw_ADD(&func, Xp, Xp, x_transform.offset);
+   brw_ADD(&func, Yp, Yp, y_transform.offset);
+   SWAP_XY_AND_XPYP();
+}
+
+/**
+ * Emit code to look up a value in the texture using the SAMPLE_LD message
+ * (which does a simple texel fetch).
+ */
+void
+brw_blorp_blit_program::texel_fetch()
+{
+   texture_lookup(GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
+                  retype(mrf_u_float, BRW_REGISTER_TYPE_UD),
+                  retype(mrf_v_float, BRW_REGISTER_TYPE_UD));
+}
+
+void
+brw_blorp_blit_program::texture_lookup(GLuint msg_type,
+                                       struct brw_reg mrf_u,
+                                       struct brw_reg mrf_v)
+{
+   /* Expand X and Y coordinates from 16 bits to 32 bits. */
+   brw_MOV(&func, vec8(mrf_u), vec8(X));
+   brw_set_compression_control(&func, BRW_COMPRESSION_2NDHALF);
+   brw_MOV(&func, offset(vec8(mrf_u), 1), suboffset(vec8(X), 8));
+   brw_set_compression_control(&func, BRW_COMPRESSION_NONE);
+   brw_MOV(&func, vec8(mrf_v), vec8(Y));
+   brw_set_compression_control(&func, BRW_COMPRESSION_2NDHALF);
+   brw_MOV(&func, offset(vec8(mrf_v), 1), suboffset(vec8(Y), 8));
+   brw_set_compression_control(&func, BRW_COMPRESSION_NONE);
+
+   brw_SAMPLE(&func,
+              retype(Rdata, BRW_REGISTER_TYPE_UW) /* dest */,
+              base_mrf /* msg_reg_nr */,
+              vec8(mrf_u) /* src0 */,
+              BRW_BLORP_TEXTURE_BINDING_TABLE_INDEX,
+              0 /* sampler -- ignored for SAMPLE_LD message */,
+              WRITEMASK_XYZW,
+              msg_type,
+              8 /* response_length.  TODO: should be smaller for non-RGBA formats? */,
+              4 /* msg_length */,
+              0 /* header_present */,
+              BRW_SAMPLER_SIMD_MODE_SIMD16,
+              BRW_SAMPLER_RETURN_FORMAT_FLOAT32);
+}
+
+#undef X
+#undef Y
+#undef U
+#undef V
+#undef S
+#undef SWAP_XY_AND_XPYP
+
+void
+brw_blorp_blit_program::render_target_write()
+{
+   struct brw_reg mrf_rt_write = vec16(brw_message_reg(base_mrf));
+   int mrf_offset = 0;
+
+   /* If we may have killed pixels, then we need to send R0 and R1 in a header
+    * so that the render target knows which pixels we killed.
+    */
+   bool use_header = key->use_kill;
+   if (use_header) {
+      /* Copy R0/1 to MRF */
+      brw_MOV(&func, retype(mrf_rt_write, BRW_REGISTER_TYPE_UD),
+              retype(R0, BRW_REGISTER_TYPE_UD));
+      mrf_offset += 2;
+   }
+
+   /* Copy texture data to MRFs */
+   for (int i = 0; i < 4; ++i) {
+      /* E.g. mov(16) m2.0<1>:f r2.0<8;8,1>:f { Align1, H1 } */
+      brw_MOV(&func, offset(mrf_rt_write, mrf_offset), offset(vec8(Rdata), 2*i));
+      mrf_offset += 2;
+   }
+
+   /* Now write to the render target and terminate the thread */
+   brw_fb_WRITE(&func,
+                16 /* dispatch_width */,
+                base_mrf /* msg_reg_nr */,
+                mrf_rt_write /* src0 */,
+                BRW_BLORP_RENDERBUFFER_BINDING_TABLE_INDEX,
+                mrf_offset /* msg_length.  TODO: Should be smaller for non-RGBA formats. */,
+                0 /* response_length */,
+                true /* eot */,
+                use_header);
+}
+
+
+void
+brw_blorp_coord_transform_params::setup(GLuint src0, GLuint dst0, GLuint dst1,
+                                        bool mirror)
+{
+   if (mirror) {
+      /* A mirrored coordinate (say, X) counts backwards from the far edge:
+       *   x' - src_x0 = dst_x1 - x - 1
+       * Solving for x':
+       *   x' = -1*x + (src_x0 + dst_x1 - 1)
+       */
+      multiplier = -1;
+      offset = src0 + dst1 - 1;
+   } else {
+      /* An unmirrored coordinate keeps its distance from the near edge:
+       *   x' - src_x0 = x - dst_x0
+       * Solving for x':
+       *   x' = 1*x + (src_x0 - dst_x0)
+       */
+      multiplier = 1;
+      offset = src0 - dst0;
+   }
+}
+
+
+brw_blorp_blit_params::brw_blorp_blit_params(struct intel_mipmap_tree *src_mt,
+                                             struct intel_mipmap_tree *dst_mt,
+                                             GLuint src_x0, GLuint src_y0,
+                                             GLuint dst_x0, GLuint dst_y0,
+                                             GLuint dst_x1, GLuint dst_y1,
+                                             bool mirror_x, bool mirror_y)
+{
+   src.set(src_mt, 0, 0);
+   dst.set(dst_mt, 0, 0);
+
+   use_wm_prog = true;
+   memset(&wm_prog_key, 0, sizeof(wm_prog_key));
+
+   wm_prog_key.src_tiled_w = src.map_stencil_as_y_tiled;
+   wm_prog_key.dst_tiled_w = dst.map_stencil_as_y_tiled;
+   x0 = wm_push_consts.dst_x0 = dst_x0;
+   y0 = wm_push_consts.dst_y0 = dst_y0;
+   x1 = wm_push_consts.dst_x1 = dst_x1;
+   y1 = wm_push_consts.dst_y1 = dst_y1;
+   wm_push_consts.x_transform.setup(src_x0, dst_x0, dst_x1, mirror_x);
+   wm_push_consts.y_transform.setup(src_y0, dst_y0, dst_y1, mirror_y);
+
+   if (dst.map_stencil_as_y_tiled) {
+      /* The destination is really W-tiled but is bound as Y-tiled, so widen
+       * the rectangle to whole tiles: pixels are scrambled within a tile, and
+       * a 64x64 W tile covers the same memory as a 128x32 Y tile.  Round the
+       * bounds outward, scaling x by 2 and y by 1/2.  y1 must be rounded up
+       * before halving: ALIGN(y1 / 2, 32) drops the last row whenever
+       * y1 % 64 == 1 (y1 == 1 would give an empty rectangle).
+       * TODO: what if this makes the coordinates too large?
+       */
+      x0 = (x0 * 2) & ~127;
+      y0 = (y0 / 2) & ~31;
+      x1 = ALIGN(x1 * 2, 128);
+      y1 = ALIGN(y1, 64) / 2;
+      wm_prog_key.use_kill = true; /* widened area needs per-pixel kill */
+   }
+}
+
+uint32_t
+brw_blorp_blit_params::get_wm_prog(struct brw_context *brw,
+                                   brw_blorp_prog_data **prog_data) const
+{
+   uint32_t prog_offset;
+   if (!brw_search_cache(&brw->cache, BRW_BLORP_BLIT_PROG,
+                         &this->wm_prog_key, sizeof(this->wm_prog_key),
+                         &prog_offset, prog_data)) {
+      brw_blorp_blit_program prog(brw, &this->wm_prog_key);
+      GLuint program_size;
+      const GLuint *program = prog.compile(brw, &program_size);
+      brw_upload_cache(&brw->cache, BRW_BLORP_BLIT_PROG,
+                       &this->wm_prog_key, sizeof(this->wm_prog_key),
+                       program, program_size,
+                       &prog.prog_data, sizeof(prog.prog_data),
+                       &prog_offset, prog_data);
+   }
+   return prog_offset;
+}
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 141fb65..8ffd208 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -536,6 +536,7 @@ enum brw_cache_id {
    BRW_CC_VP,
    BRW_CC_UNIT,
    BRW_WM_PROG,
+   BRW_BLORP_BLIT_PROG,
    BRW_SAMPLER,
    BRW_WM_UNIT,
    BRW_SF_PROG,
@@ -1091,6 +1092,13 @@ void
 gen7_end_transform_feedback(struct gl_context *ctx,
 			    struct gl_transform_feedback_object *obj);
 
+/* brw_blorp_blit.cpp */
+GLbitfield
+brw_blorp_framebuffer(struct intel_context *intel,
+                      GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
+                      GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
+                      GLbitfield mask, GLenum filter);
+
 
 
 /*======================================================================
diff --git a/src/mesa/drivers/dri/i965/gen6_blorp.cpp b/src/mesa/drivers/dri/i965/gen6_blorp.cpp
index 33a1035..00aeda6 100644
--- a/src/mesa/drivers/dri/i965/gen6_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/gen6_blorp.cpp
@@ -144,7 +144,12 @@ gen6_blorp_emit_batch_head(struct brw_context *brw,
       OUT_RELOC(intel->batch.bo, (I915_GEM_DOMAIN_RENDER |
                                   I915_GEM_DOMAIN_INSTRUCTION), 0, 1);
       OUT_BATCH(1); /* IndirectObjectBaseAddress */
-      OUT_BATCH(1); /* InstructionBaseAddress */
+      if (params->use_wm_prog) {
+         OUT_RELOC(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
+                   1); /* Instruction base address: shader kernels */
+      } else {
+         OUT_BATCH(1); /* InstructionBaseAddress */
+      }
       OUT_BATCH(1); /* GeneralStateUpperBound */
       OUT_BATCH(1); /* DynamicStateUpperBound */
       OUT_BATCH(1); /* IndirectObjectUpperBound*/
@@ -290,6 +295,51 @@ gen6_blorp_emit_urb_config(struct brw_context *brw,
 }
 
 
+/* BLEND_STATE */
+uint32_t
+gen6_blorp_emit_blend_state(struct brw_context *brw,
+                            const brw_blorp_params *params)
+{
+   uint32_t cc_blend_state_offset;
+
+   struct gen6_blend_state *blend = (struct gen6_blend_state *)
+      brw_state_batch(brw, AUB_TRACE_BLEND_STATE,
+                      sizeof(struct gen6_blend_state), 64,
+                      &cc_blend_state_offset);
+
+   memset(blend, 0, sizeof(*blend));
+
+   // TODO: handle other formats.
+   blend->blend1.pre_blend_clamp_enable = 1;
+   blend->blend1.post_blend_clamp_enable = 1;
+   blend->blend1.clamp_range = BRW_RENDERTARGET_CLAMPRANGE_FORMAT;
+
+   blend->blend1.write_disable_r = false;
+   blend->blend1.write_disable_g = false;
+   blend->blend1.write_disable_b = false;
+   blend->blend1.write_disable_a = false;
+
+   return cc_blend_state_offset;
+}
+
+
+/* CC_STATE */
+uint32_t
+gen6_blorp_emit_cc_state(struct brw_context *brw,
+                         const brw_blorp_params *params)
+{
+   uint32_t cc_state_offset;
+
+   struct gen6_color_calc_state *cc = (struct gen6_color_calc_state *)
+      brw_state_batch(brw, AUB_TRACE_CC_STATE,
+                      sizeof(gen6_color_calc_state), 64,
+                      &cc_state_offset);
+   memset(cc, 0, sizeof(*cc));
+
+   return cc_state_offset;
+}
+
+
 /**
  * \param out_offset is relative to
  *        CMD_STATE_BASE_ADDRESS.DynamicStateBaseAddress.
@@ -332,15 +382,202 @@ gen6_blorp_emit_depth_stencil_state(struct brw_context *brw,
 static void
 gen6_blorp_emit_cc_state_pointers(struct brw_context *brw,
                                   const brw_blorp_params *params,
-                                  uint32_t depthstencil_offset)
+                                  uint32_t cc_blend_state_offset,
+                                  uint32_t depthstencil_offset,
+                                  uint32_t cc_state_offset)
 {
    struct intel_context *intel = &brw->intel;
 
    BEGIN_BATCH(4);
    OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (4 - 2));
-   OUT_BATCH(1); /* BLEND_STATE offset */
+   OUT_BATCH(cc_blend_state_offset | 1); /* BLEND_STATE offset */
    OUT_BATCH(depthstencil_offset | 1); /* DEPTH_STENCIL_STATE offset */
-   OUT_BATCH(1); /* COLOR_CALC_STATE offset */
+   OUT_BATCH(cc_state_offset | 1); /* COLOR_CALC_STATE offset */
+   ADVANCE_BATCH();
+}
+
+
+/* WM push constants */
+uint32_t
+gen6_blorp_emit_wm_constants(struct brw_context *brw,
+                             const brw_blorp_params *params)
+{
+   uint32_t wm_push_const_offset;
+
+   void *constants = brw_state_batch(brw, AUB_TRACE_WM_CONSTANTS,
+                                     sizeof(params->wm_push_consts),
+                                     32, &wm_push_const_offset);
+   memcpy(constants, &params->wm_push_consts,
+          sizeof(params->wm_push_consts));
+
+   return wm_push_const_offset;
+}
+
+
+/* SURFACE_STATE for renderbuffer or texture surface (see
+ * brw_update_renderbuffer_surface and brw_update_texture_surface)
+ */
+static uint32_t
+gen6_blorp_emit_surface_state(struct brw_context *brw,
+                              const brw_blorp_params *params,
+                              const brw_blorp_surface_info *surface,
+                              uint32_t read_domains, uint32_t write_domain)
+{
+   uint32_t wm_surf_offset;
+   uint32_t width, height;
+   surface->get_miplevel_dims(&width, &height);
+   if (surface->map_stencil_as_y_tiled) {
+      width *= 2;
+      height /= 2;
+   }
+   struct intel_region *region = surface->mt->region;
+
+   /* TODO: handle other formats */
+   uint32_t format = surface->map_stencil_as_y_tiled
+      ? BRW_SURFACEFORMAT_R8_UNORM : BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
+
+   uint32_t *surf = (uint32_t *)
+      brw_state_batch(brw, AUB_TRACE_SURFACE_STATE, 6 * 4, 32,
+                      &wm_surf_offset);
+
+   surf[0] = (BRW_SURFACE_2D << BRW_SURFACE_TYPE_SHIFT |
+              BRW_SURFACE_MIPMAPLAYOUT_BELOW << BRW_SURFACE_MIPLAYOUT_SHIFT |
+              BRW_SURFACE_CUBEFACE_ENABLES |
+              format << BRW_SURFACE_FORMAT_SHIFT);
+
+   /* reloc */
+   surf[1] = region->bo->offset; /* No tile offsets needed */
+
+   surf[2] = (0 << BRW_SURFACE_LOD_SHIFT |
+              (width - 1) << BRW_SURFACE_WIDTH_SHIFT |
+              (height - 1) << BRW_SURFACE_HEIGHT_SHIFT);
+
+   uint32_t tiling = surface->map_stencil_as_y_tiled
+      ? BRW_SURFACE_TILED | BRW_SURFACE_TILED_Y
+      : brw_get_surface_tiling_bits(region->tiling);
+   uint32_t pitch_bytes = region->pitch * region->cpp;
+   if (surface->map_stencil_as_y_tiled)
+      pitch_bytes *= 2;
+   surf[3] = (tiling |
+              0 << BRW_SURFACE_DEPTH_SHIFT |
+              (pitch_bytes - 1) << BRW_SURFACE_PITCH_SHIFT);
+
+   surf[4] = 0;
+
+   surf[5] = (0 << BRW_SURFACE_X_OFFSET_SHIFT |
+              0 << BRW_SURFACE_Y_OFFSET_SHIFT |
+              (surface->mt->align_h == 4 ?
+               BRW_SURFACE_VERTICAL_ALIGN_ENABLE : 0));
+
+   /* Emit relocation to surface contents */
+   drm_intel_bo_emit_reloc(brw->intel.batch.bo,
+                           wm_surf_offset + 4,
+                           region->bo,
+                           surf[1] - region->bo->offset,
+                           read_domains, write_domain);
+
+   return wm_surf_offset;
+}
+
+
+/* BINDING_TABLE.  See brw_wm_binding_table(). */
+uint32_t
+gen6_blorp_emit_binding_table(struct brw_context *brw,
+                              const brw_blorp_params *params,
+                              uint32_t wm_surf_offset_renderbuffer,
+                              uint32_t wm_surf_offset_texture)
+{
+   uint32_t wm_bind_bo_offset;
+   uint32_t *bind = (uint32_t *)
+      brw_state_batch(brw, AUB_TRACE_BINDING_TABLE,
+                      sizeof(uint32_t) *
+                      BRW_BLORP_NUM_BINDING_TABLE_ENTRIES,
+                      32, /* alignment */
+                      &wm_bind_bo_offset);
+   bind[BRW_BLORP_RENDERBUFFER_BINDING_TABLE_INDEX] =
+      wm_surf_offset_renderbuffer;
+   bind[BRW_BLORP_TEXTURE_BINDING_TABLE_INDEX] = wm_surf_offset_texture;
+
+   return wm_bind_bo_offset;
+}
+
+
+/**
+ * SAMPLER_STATE.  See brw_update_sampler_state().
+ */
+static uint32_t
+gen6_blorp_emit_sampler_state(struct brw_context *brw,
+                              const brw_blorp_params *params)
+{
+   uint32_t sampler_offset;
+
+   struct brw_sampler_state *sampler = (struct brw_sampler_state *)
+      brw_state_batch(brw, AUB_TRACE_SAMPLER_STATE,
+                      sizeof(struct brw_sampler_state),
+                      32, &sampler_offset);
+   memset(sampler, 0, sizeof(*sampler));
+
+   sampler->ss0.min_filter = BRW_MAPFILTER_LINEAR;
+   sampler->ss0.mip_filter = BRW_MIPFILTER_NONE;
+   sampler->ss0.mag_filter = BRW_MAPFILTER_LINEAR;
+
+   sampler->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+   sampler->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+   sampler->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+
+   sampler->ss0.min_mag_neq = 1;
+
+   /* Set LOD bias:
+    */
+   sampler->ss0.lod_bias = 0;
+
+   sampler->ss0.lod_preclamp = 1; /* OpenGL mode */
+   sampler->ss0.default_color_mode = 0; /* OpenGL/DX10 mode */
+
+   /* Set BaseMipLevel, MaxLOD, MinLOD:
+    *
+    * All three are fixed at zero.  This function never consults a texture
+    * object's firstLevel/lastLevel, so the question of subtracting
+    * firstLevel from these values (a concern when they are derived from a
+    * texture object) does not arise here.
+    */
+   sampler->ss0.base_level = U_FIXED(0, 1);
+
+   sampler->ss1.max_lod = U_FIXED(0, 6);
+   sampler->ss1.min_lod = U_FIXED(0, 6);
+
+   sampler->ss3.non_normalized_coord = 1;
+
+   sampler->ss3.address_round |= BRW_ADDRESS_ROUNDING_ENABLE_U_MIN |
+      BRW_ADDRESS_ROUNDING_ENABLE_V_MIN |
+      BRW_ADDRESS_ROUNDING_ENABLE_R_MIN;
+   sampler->ss3.address_round |= BRW_ADDRESS_ROUNDING_ENABLE_U_MAG |
+      BRW_ADDRESS_ROUNDING_ENABLE_V_MAG |
+      BRW_ADDRESS_ROUNDING_ENABLE_R_MAG;
+
+   return sampler_offset;
+}
+
+
+/**
+ * 3DSTATE_SAMPLER_STATE_POINTERS.  See upload_sampler_state_pointers().
+ */
+static void
+gen6_blorp_emit_sampler_state_pointers(struct brw_context *brw,
+                                       const brw_blorp_params *params,
+                                       uint32_t sampler_offset)
+{
+   struct intel_context *intel = &brw->intel;
+
+   BEGIN_BATCH(4);
+   OUT_BATCH(_3DSTATE_SAMPLER_STATE_POINTERS << 16 |
+             VS_SAMPLER_STATE_CHANGE |
+             GS_SAMPLER_STATE_CHANGE |
+             PS_SAMPLER_STATE_CHANGE |
+             (4 - 2));
+   OUT_BATCH(0); /* VS */
+   OUT_BATCH(0); /* GS */
+   OUT_BATCH(sampler_offset);
    ADVANCE_BATCH();
 }
 
@@ -465,21 +702,27 @@ gen6_blorp_emit_sf_config(struct brw_context *brw,
 
 
 /**
- * Disable thread dispatch (dw5.19) and enable the HiZ op.
+ * Enable or disable thread dispatch and set the HiZ op appropriately.
  */
 static void
 gen6_blorp_emit_wm_config(struct brw_context *brw,
-                          const brw_blorp_params *params)
+                          const brw_blorp_params *params,
+                          uint32_t prog_offset,
+                          brw_blorp_prog_data *prog_data)
 {
    struct intel_context *intel = &brw->intel;
+   uint32_t dw2, dw4, dw5, dw6;
 
-   /* Even though thread dispatch is disabled, max threads (dw5.25:31) must be
+   /* Even when thread dispatch is disabled, max threads (dw5.25:31) must be
     * nonzero to prevent the GPU from hanging. See the valid ranges in the
     * BSpec, Volume 2a.11 Windower, Section 3DSTATE_WM, Dword 5.25:31
     * "Maximum Number Of Threads".
+    *
+    * To be safe (and to minimize extraneous code) we go ahead and fully
+    * configure the WM state whether or not there is a WM program.
     */
-   uint32_t dw4 = 0;
 
+   dw2 = dw4 = dw5 = dw6 = 0;
    switch (params->hiz_op) {
    case GEN6_HIZ_OP_DEPTH_CLEAR:
       assert(!"not implemented");
@@ -491,25 +734,89 @@ gen6_blorp_emit_wm_config(struct brw_context *brw,
    case GEN6_HIZ_OP_HIZ_RESOLVE:
       dw4 |= GEN6_WM_HIERARCHICAL_DEPTH_RESOLVE;
       break;
+   case GEN6_HIZ_OP_NONE:
+      break;
    default:
       assert(0);
       break;
    }
+   dw4 |= GEN6_WM_STATISTICS_ENABLE;
+   dw5 |= GEN6_WM_LINE_AA_WIDTH_1_0;
+   dw5 |= GEN6_WM_LINE_END_CAP_AA_WIDTH_0_5;
+   dw5 |= (brw->max_wm_threads - 1) << GEN6_WM_MAX_THREADS_SHIFT;
+   dw6 |= 0 << GEN6_WM_BARYCENTRIC_INTERPOLATION_MODE_SHIFT; /* No interp */
+   dw6 |= 0 << GEN6_WM_NUM_SF_OUTPUTS_SHIFT; /* No inputs from SF */
+   if (params->use_wm_prog) {
+      dw2 |= 1 << GEN6_WM_SAMPLER_COUNT_SHIFT; /* Up to 4 samplers */
+      dw4 |= prog_data->first_curbe_grf << GEN6_WM_DISPATCH_START_GRF_SHIFT_0;
+      dw5 |= GEN6_WM_16_DISPATCH_ENABLE;
+      dw5 |= GEN6_WM_KILL_ENABLE; /* TODO: temporarily smash on */
+      dw5 |= GEN6_WM_DISPATCH_ENABLE; /* We are rendering */
+   }
 
    BEGIN_BATCH(9);
    OUT_BATCH(_3DSTATE_WM << 16 | (9 - 2));
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   OUT_BATCH(0);
+   OUT_BATCH(params->use_wm_prog ? prog_offset : 0);
+   OUT_BATCH(dw2);
+   OUT_BATCH(0); /* No scratch needed */
    OUT_BATCH(dw4);
-   OUT_BATCH((brw->max_wm_threads - 1) << GEN6_WM_MAX_THREADS_SHIFT);
-   OUT_BATCH((1 - 1) << GEN6_WM_NUM_SF_OUTPUTS_SHIFT); /* only position */
+   OUT_BATCH(dw5);
+   OUT_BATCH(dw6); /* only position */
+   OUT_BATCH(0); /* No other programs */
+   OUT_BATCH(0); /* No other programs */
+   ADVANCE_BATCH();
+}
+
+
+static void
+gen6_blorp_emit_constant_ps(struct brw_context *brw,
+                            const brw_blorp_params *params,
+                            uint32_t wm_push_const_offset)
+{
+   struct intel_context *intel = &brw->intel;
+
+   /* Make sure the push constants fill an exact integer number of
+    * registers.
+    */
+   assert(sizeof(brw_blorp_wm_push_constants) % 32 == 0);
+
+   /* There must be at least one register worth of push constant data. */
+   assert(BRW_BLORP_NUM_PUSH_CONST_REGS > 0);
+
+   /* Enable push constant buffer 0. */
+   BEGIN_BATCH(5);
+   OUT_BATCH(_3DSTATE_CONSTANT_PS << 16 |
+             GEN6_CONSTANT_BUFFER_0_ENABLE |
+             (5 - 2));
+   OUT_BATCH(wm_push_const_offset + (BRW_BLORP_NUM_PUSH_CONST_REGS - 1));
+   OUT_BATCH(0);
    OUT_BATCH(0);
    OUT_BATCH(0);
    ADVANCE_BATCH();
 }
 
 
+/**
+ * 3DSTATE_BINDING_TABLE_POINTERS
+ */
+static void
+gen6_blorp_emit_binding_table_pointers(struct brw_context *brw,
+                                       const brw_blorp_params *params,
+                                       uint32_t wm_bind_bo_offset)
+{
+   struct intel_context *intel = &brw->intel;
+
+   BEGIN_BATCH(4);
+   OUT_BATCH(_3DSTATE_BINDING_TABLE_POINTERS << 16 |
+             GEN6_BINDING_TABLE_MODIFY_PS |
+             (4 - 2));
+   OUT_BATCH(0); /* vs -- ignored */
+   OUT_BATCH(0); /* gs -- ignored */
+   OUT_BATCH(wm_bind_bo_offset); /* wm/ps */
+   ADVANCE_BATCH();
+}
+
+
 static void
 gen6_blorp_emit_depth_stencil_config(struct brw_context *brw,
                                      const brw_blorp_params *params)
@@ -606,6 +913,25 @@ gen6_blorp_emit_depth_stencil_config(struct brw_context *brw,
 }
 
 
+static void
+gen6_blorp_emit_depth_disable(struct brw_context *brw,
+                              const brw_blorp_params *params)
+{
+   struct intel_context *intel = &brw->intel;
+
+   BEGIN_BATCH(7);
+   OUT_BATCH(_3DSTATE_DEPTH_BUFFER << 16 | (7 - 2));
+   OUT_BATCH((BRW_DEPTHFORMAT_D32_FLOAT << 18) |
+             (BRW_SURFACE_NULL << 29));
+   OUT_BATCH(0);
+   OUT_BATCH(0);
+   OUT_BATCH(0);
+   OUT_BATCH(0);
+   OUT_BATCH(0);
+   ADVANCE_BATCH();
+}
+
+
 /* 3DSTATE_CLEAR_PARAMS
  *
  * From the Sandybridge PRM, Volume 2, Part 1, Section 3DSTATE_CLEAR_PARAMS:
@@ -677,20 +1003,57 @@ gen6_blorp_exec(struct intel_context *intel,
 {
    struct gl_context *ctx = &intel->ctx;
    struct brw_context *brw = brw_context(ctx);
+   brw_blorp_prog_data *prog_data = NULL;
+   uint32_t cc_blend_state_offset = 0;
+   uint32_t cc_state_offset = 0;
    uint32_t depthstencil_offset;
+   uint32_t wm_push_const_offset = 0;
+   uint32_t wm_bind_bo_offset = 0;
 
+   uint32_t prog_offset = params->get_wm_prog(brw, &prog_data);
    gen6_blorp_emit_batch_head(brw, params);
    gen6_blorp_emit_vertices(brw, params);
    gen6_blorp_emit_urb_config(brw, params);
+   if (params->use_wm_prog) {
+      cc_blend_state_offset = gen6_blorp_emit_blend_state(brw, params);
+      cc_state_offset = gen6_blorp_emit_cc_state(brw, params);
+   }
    depthstencil_offset = gen6_blorp_emit_depth_stencil_state(brw, params);
-   gen6_blorp_emit_cc_state_pointers(brw, params, depthstencil_offset);
+   gen6_blorp_emit_cc_state_pointers(brw, params, cc_blend_state_offset,
+                                     depthstencil_offset, cc_state_offset);
+   if (params->use_wm_prog) {
+      uint32_t wm_surf_offset_renderbuffer;
+      uint32_t wm_surf_offset_texture;
+      uint32_t sampler_offset;
+      wm_push_const_offset = gen6_blorp_emit_wm_constants(brw, params);
+      wm_surf_offset_renderbuffer =
+         gen6_blorp_emit_surface_state(brw, params, &params->dst,
+                                       I915_GEM_DOMAIN_RENDER,
+                                       I915_GEM_DOMAIN_RENDER);
+      wm_surf_offset_texture =
+         gen6_blorp_emit_surface_state(brw, params, &params->src,
+                                       I915_GEM_DOMAIN_SAMPLER, 0);
+      wm_bind_bo_offset =
+         gen6_blorp_emit_binding_table(brw, params,
+                                       wm_surf_offset_renderbuffer,
+                                       wm_surf_offset_texture);
+      sampler_offset = gen6_blorp_emit_sampler_state(brw, params);
+      gen6_blorp_emit_sampler_state_pointers(brw, params, sampler_offset);
+   }
    gen6_blorp_emit_vs_disable(brw, params);
    gen6_blorp_emit_gs_disable(brw, params);
    gen6_blorp_emit_clip_disable(brw, params);
    gen6_blorp_emit_sf_config(brw, params);
-   gen6_blorp_emit_wm_config(brw, params);
-
-   gen6_blorp_emit_depth_stencil_config(brw, params);
+   if (params->use_wm_prog)
+      gen6_blorp_emit_constant_ps(brw, params, wm_push_const_offset);
+   gen6_blorp_emit_wm_config(brw, params, prog_offset, prog_data);
+   if (params->use_wm_prog)
+      gen6_blorp_emit_binding_table_pointers(brw, params, wm_bind_bo_offset);
+
+   if (params->depth.mt)
+      gen6_blorp_emit_depth_stencil_config(brw, params);
+   else
+      gen6_blorp_emit_depth_disable(brw, params);
    gen6_blorp_emit_clear_params(brw, params);
    gen6_blorp_emit_drawing_rectangle(brw, params);
    gen6_blorp_emit_primitive(brw, params);
diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
index 64badcc..f10d0aa 100644
--- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
@@ -81,6 +81,36 @@ gen7_blorp_emit_urb_config(struct brw_context *brw,
 }
 
 
+/* 3DSTATE_BLEND_STATE_POINTERS */
+static void
+gen7_blorp_emit_blend_state_pointer(struct brw_context *brw,
+                                    const brw_blorp_params *params,
+                                    uint32_t cc_blend_state_offset)
+{
+   struct intel_context *intel = &brw->intel;
+
+   BEGIN_BATCH(2);
+   OUT_BATCH(_3DSTATE_BLEND_STATE_POINTERS << 16 | (2 - 2));
+   OUT_BATCH(cc_blend_state_offset | 1);
+   ADVANCE_BATCH();
+}
+
+
+/* 3DSTATE_CC_STATE_POINTERS */
+static void
+gen7_blorp_emit_cc_state_pointer(struct brw_context *brw,
+                                 const brw_blorp_params *params,
+                                 uint32_t cc_state_offset)
+{
+   struct intel_context *intel = &brw->intel;
+
+   BEGIN_BATCH(2);
+   OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (2 - 2));
+   OUT_BATCH(cc_state_offset | 1);
+   ADVANCE_BATCH();
+}
+
+
 /* 3DSTATE_DEPTH_STENCIL_STATE_POINTERS
  *
  * The offset is relative to CMD_STATE_BASE_ADDRESS.DynamicStateBaseAddress.
@@ -99,6 +129,134 @@ gen7_blorp_emit_depth_stencil_state_pointers(struct brw_context *brw,
 }
 
 
+/* SURFACE_STATE for renderbuffer or texture surface (see
+ * brw_update_renderbuffer_surface and brw_update_texture_surface).
+ * Returns the offset of the new state as written by brw_state_batch(). */
+static uint32_t
+gen7_blorp_emit_surface_state(struct brw_context *brw,
+                              const brw_blorp_params *params,
+                              const brw_blorp_surface_info *surface,
+                              uint32_t read_domains, uint32_t write_domain)
+{
+   struct intel_context *intel = &brw->intel;
+
+   uint32_t wm_surf_offset;
+   uint32_t width, height;
+   surface->get_miplevel_dims(&width, &height);
+   if (surface->map_stencil_as_y_tiled) { /* twice as wide, half as tall */
+      width *= 2;
+      height /= 2; /* NOTE(review): truncates when height is odd */
+   }
+   struct intel_region *region = surface->mt->region;
+
+   /* TODO: handle other formats; only R8 and BGRA8 are selected for now */
+   uint32_t format = surface->map_stencil_as_y_tiled
+      ? BRW_SURFACEFORMAT_R8_UNORM : BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
+
+   struct gen7_surface_state *surf = (struct gen7_surface_state *)
+      brw_state_batch(brw, AUB_TRACE_SURFACE_STATE, sizeof(*surf), 32,
+                      &wm_surf_offset);
+   memset(surf, 0, sizeof(*surf));
+
+   if (surface->mt->align_h == 4)
+      surf->ss0.vertical_alignment = 1;
+   if (surface->mt->align_w == 8)
+      surf->ss0.horizontal_alignment = 1;
+
+   surf->ss0.surface_format = format;
+   surf->ss0.surface_type = BRW_SURFACE_2D;
+
+   /* reloc (registered with drm_intel_bo_emit_reloc() below) */
+   surf->ss1.base_addr = region->bo->offset; /* No tile offsets needed */
+
+   surf->ss2.width = width - 1;
+   surf->ss2.height = height - 1;
+
+   uint32_t tiling = surface->map_stencil_as_y_tiled
+      ? I915_TILING_Y : region->tiling;
+   gen7_set_surface_tiling(surf, tiling);
+
+   uint32_t pitch_bytes = region->pitch * region->cpp;
+   if (surface->map_stencil_as_y_tiled)
+      pitch_bytes *= 2; /* doubled along with the width above */
+   surf->ss3.pitch = pitch_bytes - 1;
+
+   if (intel->is_haswell) { /* map each channel to itself */
+      surf->ss7.shader_chanel_select_r = HSW_SCS_RED;
+      surf->ss7.shader_chanel_select_g = HSW_SCS_GREEN;
+      surf->ss7.shader_chanel_select_b = HSW_SCS_BLUE;
+      surf->ss7.shader_chanel_select_a = HSW_SCS_ALPHA;
+   }
+
+   /* Emit relocation to surface contents; the delta argument is 0 here */
+   drm_intel_bo_emit_reloc(brw->intel.batch.bo,
+                           wm_surf_offset +
+                           offsetof(struct gen7_surface_state, ss1),
+                           region->bo,
+                           surf->ss1.base_addr - region->bo->offset,
+                           read_domains, write_domain);
+
+   return wm_surf_offset;
+}
+
+
+/**
+ * Blorp SAMPLER_STATE; returns its offset.  See gen7_update_sampler_state().
+ */
+static uint32_t
+gen7_blorp_emit_sampler_state(struct brw_context *brw,
+                              const brw_blorp_params *params)
+{
+   uint32_t sampler_offset;
+
+   struct gen7_sampler_state *sampler = (struct gen7_sampler_state *)
+      brw_state_batch(brw, AUB_TRACE_SAMPLER_STATE,
+                      sizeof(struct gen7_sampler_state),
+                      32, &sampler_offset);
+   memset(sampler, 0, sizeof(*sampler));
+
+   sampler->ss0.min_filter = BRW_MAPFILTER_LINEAR;
+   sampler->ss0.mip_filter = BRW_MIPFILTER_NONE;
+   sampler->ss0.mag_filter = BRW_MAPFILTER_LINEAR;
+
+   sampler->ss3.r_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+   sampler->ss3.s_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+   sampler->ss3.t_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+
+   /* ss0.min_mag_neq stays 0 (min and mag filters above are the same). */
+
+   /* Set LOD bias: none.
+    */
+   sampler->ss0.lod_bias = 0;
+
+   sampler->ss0.lod_preclamp = 1; /* OpenGL mode */
+   sampler->ss0.default_color_mode = 0; /* OpenGL/DX10 mode */
+
+   /* Set BaseMipLevel, MaxLOD, MinLOD:
+    *
+    * All three are programmed from the constant 0 below.
+    * XXX: the firstLevel/lastLevel concern noted for the regular sampler
+    * state (presumably in gen7_update_sampler_state()) is not handled
+    * here; no firstLevel or lastLevel is read by this function.
+    */
+   sampler->ss0.base_level = U_FIXED(0, 1);
+
+   sampler->ss1.max_lod = U_FIXED(0, 8);
+   sampler->ss1.min_lod = U_FIXED(0, 8);
+
+   sampler->ss3.non_normalized_coord = 1;
+
+   sampler->ss3.address_round |= BRW_ADDRESS_ROUNDING_ENABLE_U_MIN |
+      BRW_ADDRESS_ROUNDING_ENABLE_V_MIN |
+      BRW_ADDRESS_ROUNDING_ENABLE_R_MIN;
+   sampler->ss3.address_round |= BRW_ADDRESS_ROUNDING_ENABLE_U_MAG |
+      BRW_ADDRESS_ROUNDING_ENABLE_V_MAG |
+      BRW_ADDRESS_ROUNDING_ENABLE_R_MAG;
+
+   return sampler_offset;
+}
+
+
 /* 3DSTATE_HS
  *
  * Disable the hull shader.
@@ -256,6 +414,14 @@ gen7_blorp_emit_wm_config(struct brw_context *brw,
       assert(0);
       break;
    }
+   dw1 |= GEN7_WM_STATISTICS_ENABLE;
+   dw1 |= GEN7_WM_LINE_AA_WIDTH_1_0;
+   dw1 |= GEN7_WM_LINE_END_CAP_AA_WIDTH_0_5;
+   dw1 |= 0 << GEN7_WM_BARYCENTRIC_INTERPOLATION_MODE_SHIFT; /* No interp */
+   if (params->use_wm_prog) {
+      dw1 |= GEN7_WM_KILL_ENABLE; /* TODO: temporarily smash on */
+      dw1 |= GEN7_WM_DISPATCH_ENABLE; /* We are rendering */
+   }
 
    BEGIN_BATCH(3);
    OUT_BATCH(_3DSTATE_WM << 16 | (3 - 2));
@@ -278,17 +444,89 @@ gen7_blorp_emit_wm_config(struct brw_context *brw,
  */
 static void
 gen7_blorp_emit_ps_config(struct brw_context *brw,
-                          const brw_blorp_params *params)
+                          const brw_blorp_params *params,
+                          uint32_t prog_offset,
+                          brw_blorp_prog_data *prog_data)
 {
    struct intel_context *intel = &brw->intel;
+   uint32_t dw2, dw4, dw5;
+   const int max_threads_shift = brw->intel.is_haswell ?
+      HSW_PS_MAX_THREADS_SHIFT : IVB_PS_MAX_THREADS_SHIFT;
+
+   dw2 = dw4 = dw5 = 0;
+   dw4 |= (brw->max_wm_threads - 1) << max_threads_shift;
+   dw4 |= GEN7_PS_32_DISPATCH_ENABLE;
+   if (intel->is_haswell)
+      dw4 |= SET_FIELD(1, HSW_PS_SAMPLE_MASK); /* 1 sample for now */
+   if (params->use_wm_prog) {
+      dw2 |= 1 << GEN7_PS_SAMPLER_COUNT_SHIFT; /* Up to 4 samplers */
+      dw4 |= GEN7_PS_PUSH_CONSTANT_ENABLE;
+      dw5 |= prog_data->first_curbe_grf << GEN7_PS_DISPATCH_START_GRF_SHIFT_0;
+   }
 
    BEGIN_BATCH(8);
    OUT_BATCH(_3DSTATE_PS << 16 | (8 - 2));
+   OUT_BATCH(params->use_wm_prog ? prog_offset : 0);
+   OUT_BATCH(dw2);
    OUT_BATCH(0);
+   OUT_BATCH(dw4);
+   OUT_BATCH(dw5);
    OUT_BATCH(0);
    OUT_BATCH(0);
-   OUT_BATCH(((brw->max_wm_threads - 1) << IVB_PS_MAX_THREADS_SHIFT) |
-             GEN7_PS_32_DISPATCH_ENABLE);
+   ADVANCE_BATCH();
+}
+
+/* 3DSTATE_BINDING_TABLE_POINTERS_PS: 2-dword packet with the table offset. */
+static void
+gen7_blorp_emit_binding_table_pointers_ps(struct brw_context *brw,
+                                          const brw_blorp_params *params,
+                                          uint32_t wm_bind_bo_offset)
+{
+   struct intel_context *intel = &brw->intel;
+
+   BEGIN_BATCH(2);
+   OUT_BATCH(_3DSTATE_BINDING_TABLE_POINTERS_PS << 16 | (2 - 2));
+   OUT_BATCH(wm_bind_bo_offset);
+   ADVANCE_BATCH();
+}
+
+/* 3DSTATE_SAMPLER_STATE_POINTERS_PS: 2-dword packet with the sampler offset. */
+static void
+gen7_blorp_emit_sampler_state_pointers_ps(struct brw_context *brw,
+                                          const brw_blorp_params *params,
+                                          uint32_t sampler_offset)
+{
+   struct intel_context *intel = &brw->intel;
+
+   BEGIN_BATCH(2);
+   OUT_BATCH(_3DSTATE_SAMPLER_STATE_POINTERS_PS << 16 | (2 - 2));
+   OUT_BATCH(sampler_offset);
+   ADVANCE_BATCH();
+}
+
+
+static void
+gen7_blorp_emit_constant_ps(struct brw_context *brw,
+                            const brw_blorp_params *params,
+                            uint32_t wm_push_const_offset)
+{
+   struct intel_context *intel = &brw->intel;
+
+   /* Make sure the push constants fill an exact integer number of
+    * registers.
+    */
+   assert(sizeof(brw_blorp_wm_push_constants) % 32 == 0);
+
+   /* There must be at least one register worth of push constant data. */
+   assert(BRW_BLORP_NUM_PUSH_CONST_REGS > 0);
+
+   /* Enable push constant buffer 0. */
+   BEGIN_BATCH(7);
+   OUT_BATCH(_3DSTATE_CONSTANT_PS << 16 |
+             (7 - 2));
+   OUT_BATCH(BRW_BLORP_NUM_PUSH_CONST_REGS);
+   OUT_BATCH(0);
+   OUT_BATCH(wm_push_const_offset);
    OUT_BATCH(0);
    OUT_BATCH(0);
    OUT_BATCH(0);
@@ -304,8 +542,10 @@ gen7_blorp_emit_depth_stencil_config(struct brw_context *brw,
    uint32_t draw_x, draw_y;
    uint32_t tile_mask_x, tile_mask_y;
 
-   params->depth.get_draw_offsets(&draw_x, &draw_y);
-   gen6_blorp_compute_tile_masks(params, &tile_mask_x, &tile_mask_y);
+   if (params->depth.mt) {
+      params->depth.get_draw_offsets(&draw_x, &draw_y);
+      gen6_blorp_compute_tile_masks(params, &tile_mask_x, &tile_mask_y);
+   }
 
    /* 3DSTATE_DEPTH_BUFFER */
    {
@@ -388,6 +628,24 @@ gen7_blorp_emit_depth_stencil_config(struct brw_context *brw,
 }
 
 
+static void
+gen7_blorp_emit_depth_disable(struct brw_context *brw,
+                              const brw_blorp_params *params)
+{
+   struct intel_context *intel = &brw->intel;
+
+   BEGIN_BATCH(7); /* 3DSTATE_DEPTH_BUFFER with surface type NULL */
+   OUT_BATCH(GEN7_3DSTATE_DEPTH_BUFFER << 16 | (7 - 2));
+   OUT_BATCH(BRW_DEPTHFORMAT_D32_FLOAT << 18 | (BRW_SURFACE_NULL << 29));
+   OUT_BATCH(0); /* the remaining five dwords are all zero */
+   OUT_BATCH(0);
+   OUT_BATCH(0);
+   OUT_BATCH(0);
+   OUT_BATCH(0);
+   ADVANCE_BATCH();
+}
+
+
 /* 3DSTATE_CLEAR_PARAMS
  *
  * From the BSpec, Volume 2a.11 Windower, Section 1.5.6.3.2
@@ -439,14 +697,44 @@ gen7_blorp_exec(struct intel_context *intel,
 {
    struct gl_context *ctx = &intel->ctx;
    struct brw_context *brw = brw_context(ctx);
+   brw_blorp_prog_data *prog_data = NULL;
+   uint32_t cc_blend_state_offset = 0;
+   uint32_t cc_state_offset = 0;
    uint32_t depthstencil_offset;
+   uint32_t wm_push_const_offset = 0;
+   uint32_t wm_bind_bo_offset = 0;
+   uint32_t sampler_offset = 0;
 
+   uint32_t prog_offset = params->get_wm_prog(brw, &prog_data);
    gen6_blorp_emit_batch_head(brw, params);
    gen6_blorp_emit_vertices(brw, params);
    gen7_blorp_emit_urb_config(brw, params);
+   if (params->use_wm_prog) {
+      cc_blend_state_offset = gen6_blorp_emit_blend_state(brw, params);
+      cc_state_offset = gen6_blorp_emit_cc_state(brw, params);
+      gen7_blorp_emit_blend_state_pointer(brw, params, cc_blend_state_offset);
+      gen7_blorp_emit_cc_state_pointer(brw, params, cc_state_offset);
+   }
    depthstencil_offset = gen6_blorp_emit_depth_stencil_state(brw, params);
    gen7_blorp_emit_depth_stencil_state_pointers(brw, params,
                                                 depthstencil_offset);
+   if (params->use_wm_prog) {
+      uint32_t wm_surf_offset_renderbuffer;
+      uint32_t wm_surf_offset_texture;
+      wm_push_const_offset = gen6_blorp_emit_wm_constants(brw, params);
+      wm_surf_offset_renderbuffer =
+         gen7_blorp_emit_surface_state(brw, params, &params->dst,
+                                       I915_GEM_DOMAIN_RENDER,
+                                       I915_GEM_DOMAIN_RENDER);
+      wm_surf_offset_texture =
+         gen7_blorp_emit_surface_state(brw, params, &params->src,
+                                       I915_GEM_DOMAIN_SAMPLER, 0);
+      wm_bind_bo_offset =
+         gen6_blorp_emit_binding_table(brw, params,
+                                       wm_surf_offset_renderbuffer,
+                                       wm_surf_offset_texture);
+      sampler_offset = gen7_blorp_emit_sampler_state(brw, params);
+   }
    gen6_blorp_emit_vs_disable(brw, params);
    gen7_blorp_emit_hs_disable(brw, params);
    gen7_blorp_emit_te_disable(brw, params);
@@ -456,9 +744,18 @@ gen7_blorp_exec(struct intel_context *intel,
    gen6_blorp_emit_clip_disable(brw, params);
    gen7_blorp_emit_sf_config(brw, params);
    gen7_blorp_emit_wm_config(brw, params);
-   gen7_blorp_emit_ps_config(brw, params);
+   if (params->use_wm_prog) {
+      gen7_blorp_emit_binding_table_pointers_ps(brw, params,
+                                                wm_bind_bo_offset);
+      gen7_blorp_emit_sampler_state_pointers_ps(brw, params, sampler_offset);
+      gen7_blorp_emit_constant_ps(brw, params, wm_push_const_offset);
+   }
+   gen7_blorp_emit_ps_config(brw, params, prog_offset, prog_data);
 
-   gen7_blorp_emit_depth_stencil_config(brw, params);
+   if (params->depth.mt)
+      gen7_blorp_emit_depth_stencil_config(brw, params);
+   else
+      gen7_blorp_emit_depth_disable(brw, params);
    gen7_blorp_emit_clear_params(brw, params);
    gen6_blorp_emit_drawing_rectangle(brw, params);
    gen7_blorp_emit_primitive(brw, params);
diff --git a/src/mesa/drivers/dri/intel/intel_fbo.c b/src/mesa/drivers/dri/intel/intel_fbo.c
index 2f95ad0..f92d78f 100644
--- a/src/mesa/drivers/dri/intel/intel_fbo.c
+++ b/src/mesa/drivers/dri/intel/intel_fbo.c
@@ -823,6 +823,15 @@ intel_blit_framebuffer(struct gl_context *ctx,
    if (mask == 0x0)
       return;
 
+#ifndef I915
+   mask = brw_blorp_framebuffer(intel_context(ctx),
+                                srcX0, srcY0, srcX1, srcY1,
+                                dstX0, dstY0, dstX1, dstY1,
+                                mask, filter);
+   if (mask == 0x0)
+      return;
+#endif
+
    _mesa_meta_BlitFramebuffer(ctx,
                               srcX0, srcY0, srcX1, srcY1,
                               dstX0, dstY0, dstX1, dstY1,
-- 
1.7.7.6



More information about the mesa-dev mailing list