[Mesa-dev] [PATCH 18/22] i965/fs/gen6: Support for sampling stencil with non-msaa coordinates

Mon Jun 9 00:45:52 PDT 2014

Signed-off-by: Topi Pohjolainen <topi.pohjolainen at intel.com>
---
 src/mesa/drivers/dri/i965/Makefile.sources       |   1 +
 src/mesa/drivers/dri/i965/brw_fs.h               |   1 +
 src/mesa/drivers/dri/i965/brw_fs_stencil_tex.cpp | 411 +++++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_fs_stencil_tex.h   |  74 ++++
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp     |  11 +
 5 files changed, 498 insertions(+)
 create mode 100644 src/mesa/drivers/dri/i965/brw_fs_stencil_tex.cpp
 create mode 100644 src/mesa/drivers/dri/i965/brw_fs_stencil_tex.h

diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index d43fc8e..179ea67 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -68,6 +68,7 @@ i965_FILES = \
 	brw_fs_saturate_propagation.cpp \
 	brw_fs_sel_peephole.cpp \
 	brw_fs_vector_splitting.cpp \
+	brw_fs_stencil_tex.cpp \
 	brw_fs_visitor.cpp \
 	brw_fs_emitter.cpp \
 	brw_gs.c \
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 55877c1..e1f540d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -52,6 +52,7 @@ extern "C" {
 #include "glsl/glsl_types.h"
 #include "glsl/ir.h"
 #include "brw_fs_emit.h"
+#include "brw_fs_stencil_tex.h"
 
 #define MAX_SAMPLER_MESSAGE_SIZE 11
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_stencil_tex.cpp b/src/mesa/drivers/dri/i965/brw_fs_stencil_tex.cpp
new file mode 100644
index 0000000..2d813e9
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_fs_stencil_tex.cpp
@@ -0,0 +1,411 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_fs_stencil_tex.cpp
+ *
+ * Support for coordinate translations needed for sampling stencil buffers.
+ * Stencil buffers are W-tiled which is unfortunately not understood by the
+ * sampling engine. The surface is setup as Y-tiled instead and the fragment
+ * program is augmented with instructions translating the coordinates into
+ * equivalent in Y-tiled space.
+ *
+ * In W-tiling four 4x4 blocks form an 8x8 block. These 8x8 blocks are laid
+ * out in column major order as follows:
+ *
+ *           0             1                    7
+ *    +-------------+-------------+-----+---------------+
+ *  0 | 0, 0 | 1, 0 | 2, 0 | 3, 0 | ... | 14, 0 | 15, 0 |
+ *    | 0, 1 | 1, 1 | 2, 1 | 3, 1 | ... | 14, 1 | 15, 1 |
+ *    + ...                                             +
+ *  7 | 0,14 | 1,14 | 2,14 | 3,14 | ... | 14,14 | 15,14 |
+ *    | 0,15 | 1,15 | 2,15 | 3,15 | ... | 14,15 | 15,15 |
+ *    +-------------------------------------------------+
+ *                W-tile (8x8 16x16 blocks)
+ *
+ * In Y-tiling the 16x1 sub-tiles are also laid out in column major order,
+ * and the 4x4 sub-tiles (designated by their w-tiled coordinates) can be seen
+ * to be located in the Y-tiled layout as follows:
+ *
+ *     ------------------------------------
+ *   0 | 0, 0 | 2, 0 | 4, 0 | ... | 14, 0 |
+ *   1 | 1, 0 | 3, 0 | 5, 0 | ... | 15, 0 |
+ *   2 | 0, 1 | 2, 1 | 4, 1 | ... | 14, 1 |
+ *   3 | 1, 1 | 3, 1 | 5, 1 | ... | 15, 1 |
+ *     | ...                              |
+ *  30 | 0,15 | 2,15 | 4,15 | ... | 14,15 |
+ *  31 | 1,15 | 3,15 | 5,15 | ... | 15,15 |
+ *     ------------------------------------
+ *          Y-tile (8x32 16x1 blocks)
+ */
+#include "brw_fs_stencil_tex.h"
+#include "brw_defines.h"
+#include "intel_mipmap_tree.h"
+#include "program/hash_table.h"
+
+namespace {
+
+/**
+ * Emit translation of the pixel coordinates held in coord (x) and
+ * offset(coord, 1) (y) from W-tiled layout to Y-tiled layout. The result is
+ * computed in temporaries and finally written back over the original ones.
+ *
+ * Both W-tiling and Y-tiling have equal tile size of one page. The difference
+ * is in how pixels are organised within the page: W-tile has 64 rows each
+ * holding in turn two 32 byte sub-tiles whereas Y-tile has 32 rows each
+ * holding eight 16 byte sub-tiles. The sub-tiles in turn have different
+ * layout: W is 8x4 bytes and Y is 16x1. Now, in Y-tiling two subsequent tiles
+ * are on top of each other. If each pair is thought to form one tile instead
+ * one can think Y-tiling to consist of 16 rows and eight columns of 32-byte
+ * subtiles.
+ *
+ * This organisation is independent of the pixel format used and
+ * hence the number of pixels within a tile varies based on how many bytes
+ * per pixel are needed.
+ *
+ * The operation here is fixed to one-byte-per-pixel formats only - it
+ * assumes that Y-subtile holds 16 pixels per row (and W 8 respectively).
+ * 
+ * First examine the X coordinate representing an address using W-tiling.
+ * The lowest six bits represent a column within a tile while the higher bits
+ * designate a tile number horizontally.
+ * As a Y-tile can hold twice as many pixels horizontally than W-tile, the
+ * tile number needs to be multiplied by two in order to move to the desired
+ * tile horizontally:
+ *
+ *    (X & ~0b111) << 1  ==  (X & 0xFFF8) << 1           (1)
+ *
+ * The lowest six can be further divided in two parts - the subtile number
+ * and then the remaining coordinate within the subtile. These are three and
+ * three respectively for W-tiling. Then unlike Y-subtile W is further
+ * divided into 4x4 and again into 2x2 tiles. Hence the third lowest bit
+ * represents the 4x4-subtile number, the second lowest the 2x2-subtile
+ * number and finally the lowest the offset within the 2x2 block.
+ *
+ *     01 23  45 67        0123456789ABCDEF              W      Y
+ *   ++==+==++==+==++      +---------------+       a:   3,1    7,0
+ * 0 ||  |  ||  |  ||    0 |       a       |       b:   1,2    1,1
+ * 1 ||  | a||  |  ||    1 | b       c     |       c:   5,3   10,1
+ *   ++--+--++--+--++      +---------------+
+ * 2 || b|  ||  |  ||
+ * 3 ||  |  ||c |  ||
+ *   ++==+==++==+==++
+ *
+ * Observing the W layout it can be seen that x-coordinates greater or equal
+ * to four reside on the second half of the subtile - in Y-tile this
+ * corresponds to the second row calling for the following compensation in
+ * the vertical coordinate:
+ *
+ *    (X & 0b100) >> 2   ==   (X & 0x4) >> 2              (2)
+ *
+ * The 2x2 subtile in turn results into two x-coordinates x and x + 2 in the
+ * same row (in the same 8x4 subtile) to be 4 bytes apart in linear memory.
+ * As addresses in Y-subtile itself are linear, the compensation in the
+ * horizontal coordinate is:
+ *
+ *    (X & 0b10) << 1                                     (3)
+ *
+ * Combined with (1):
+ *
+ *    (X & ~0b101) << 1  ==  (X & 0xFFFA) << 1            (4)
+ *
+ * Similarly in 4x4 W-subtile in the same column any two y-coordinates y and
+ * y + 2 are 8 bytes apart in linear memory addresses. In 2x2-subtile in turn
+ * y and y + 1 are two bytes apart. This results into horizontal compensation
+ * in Y layout:
+ *
+ *    (Y & 0b10) << 2 | (Y & 0b1) << 1                    (5)
+ *
+ * Taking into account (4) and (5), one gets:
+ *
+ *    X' = (X & ~0b101) << 1 | (Y & 0b10) << 2 | (Y & 0b1) << 1 | X & 0b1
+ *
+ * For y-coordinate one needs to consider full tiles and (2). As Y-layout
+ * has twice as many tiles as W horizontally, the number of tiles in vertical
+ * direction needs to be divided by two. Every two tiles on top each other
+ * in W-layout are "re-layouted" horizontally.
+ *
+ *    Y' = (Y & ~0b11) >> 1 | (X & 0b100) >> 2
+ */
+static void
+emit_translate_w_to_y_tiling(fs_emitter *e, const fs_reg& coord)
+{
+   fs_reg src_x(coord); /* W-tiled x, read from coord */
+   fs_reg src_y(offset(coord, 1)); /* W-tiled y, read from coord + 1 */
+   fs_reg dst_x(e, glsl_type::uint_type); /* receives X' */
+   fs_reg dst_y(e, glsl_type::uint_type); /* receives Y' */
+   fs_reg t1(e, glsl_type::uint_type); /* scratch: running result */
+   fs_reg t2(e, glsl_type::uint_type); /* scratch: term being merged in */
+
+   e->emit(e->AND(t1, src_x, fs_reg(0xfffa))); /* X & ~0b101 (16-bit mask) */
+   e->emit(e->SHL(t1, t1, fs_reg(1))); /* (X & ~0b101) << 1 */
+   e->emit(e->AND(t2, src_y, fs_reg(2))); /* Y & 0b10 */
+   e->emit(e->SHL(t2, t2, fs_reg(2))); /* (Y & 0b10) << 2 */
+   e->emit(e->OR(t1, t1, t2)); /* (X & ~0b101) << 1 | (Y & 0b10) << 2 */
+   e->emit(e->AND(t2, src_y, fs_reg(1))); /* Y & 0b1 */
+   e->emit(e->SHL(t2, t2, fs_reg(1))); /* (Y & 0b1) << 1 */
+   e->emit(e->OR(t1, t1, t2)); /* (X & ~0b101) << 1 | (Y & 0b10) << 2
+                               | (Y & 0b1) << 1 */
+   e->emit(e->AND(t2, src_x, fs_reg(1))); /* X & 0b1 */
+   e->emit(e->OR(dst_x, t1, t2)); /* X' is complete */
+   e->emit(e->AND(t1, src_y, fs_reg(0xfffc))); /* Y & ~0b11 (16-bit mask) */
+   e->emit(e->SHR(t1, t1, fs_reg(1))); /* (Y & ~0b11) >> 1 */
+   e->emit(e->AND(t2, src_x, fs_reg(4))); /* X & 0b100 */
+   e->emit(e->SHR(t2, t2, fs_reg(2))); /* (X & 0b100) >> 2 */
+   e->emit(e->OR(dst_y, t1, t2)); /* Y' is complete */
+
+   /* Finally write the translated coordinates over the original ones. */
+   e->emit(e->MOV(coord, dst_x));
+   e->emit(e->MOV(offset(coord, 1), dst_y));
+}
+
+/**
+ * All the miptrees have the same "below" layout where both levels one and two
+ * are just below level zero. From there on level three is just below level
+ * two, level four just below level three and so on.
+ * The operation requires two temporary registers.
+ *
+ * The heights of the levels of a miptree form a geometric sequence:
+ *
+ *     h/1 + h/2 + h/4 + ... + h/2^(n - 1)
+ *   = h * 0.5^0 + h * 0.5^1 + h * 0.5^2 + ... + h * 0.5^(n - 1)
+ *   = h * (1 - 0.5^n) / (1 - 0.5)
+ *   = h * 2 * (1 - 0.5^n)
+ *
+ * Unfortunately individual levels are further aligned by four (< gen7) and
+ * by eight on later hardware preventing the use of the simple formula for the
+ * sum. Instead the implementation calculates the sum by iterating over the
+ * given 'n'.
+ */
+static void
+emit_calc_level_y_offset(fs_emitter *e, const fs_reg& height,
+                         const fs_reg& level, const fs_reg& y)
+{
+   const unsigned align_h = e->brw->gen == 6 ? 4 : 8; /* height alignment */
+   const fs_reg align_h_mask(0xffff - (align_h - 1)); /* 16-bit ALIGN mask */
+   fs_reg curr_h(e, glsl_type::uint_type); /* height of the level visited */
+   fs_reg tmp(e, glsl_type::uint_type); /* curr_h rounded up to align_h */
+
+   /* Levels 0-2: y is zero for level 0, else the aligned base height. */
+   fs_inst *inst = e->emit(BRW_OPCODE_CMP, reg_null_d, level, fs_reg(0));
+   inst->conditional_mod = BRW_CONDITIONAL_G; /* NOTE(review): _d here, _ud below */
+   e->emit(BRW_OPCODE_IF);
+   e->emit(e->ADD(y, height, fs_reg(align_h - 1)));
+   e->emit(e->AND(y, y, align_h_mask)); /* y = ALIGN(height, align_h) */
+   e->emit(BRW_OPCODE_ELSE);
+   e->emit(e->MOV(y, fs_reg(0)));
+   e->emit(BRW_OPCODE_ENDIF);
+
+   /* Levels above two: add the aligned heights of levels 2 .. level - 1. */
+   e->emit(e->SHR(curr_h, height, fs_reg(2))); /* height / 4 */
+   e->emit(BRW_OPCODE_DO);
+   inst = e->emit(BRW_OPCODE_CMP, reg_null_ud, level, fs_reg(2));
+   inst->conditional_mod = BRW_CONDITIONAL_LE;
+   inst = e->emit(BRW_OPCODE_BREAK); /* leave the loop once level <= 2 */
+   inst->predicate = BRW_PREDICATE_NORMAL;
+
+   e->emit(e->ADD(tmp, curr_h, fs_reg(align_h - 1)));
+   e->emit(e->AND(tmp, tmp, align_h_mask));
+   e->emit(e->ADD(y, y, tmp)); /* y = y + ALIGN(curr_h, align_h) */
+
+   e->emit(e->SHR(curr_h, curr_h, fs_reg(1))); /* next level: half the height */
+   e->emit(e->ADD(level, level, fs_reg(-1))); /* writes the caller's register */
+   e->emit(BRW_OPCODE_WHILE);
+}
+
+/**
+ * The x-coordinate is zero for levels zero and one. For the rest it is fixed
+ * to the half of the base width - levels starting from two are just right to
+ * the level one (which has half the width of level zero).
+ */
+static void
+emit_calc_level_x_offset(fs_emitter *e, const fs_reg& width,
+                         const fs_reg& level, const fs_reg& x)
+{
+   static const unsigned align_w = 8; /* the 0xfff8 mask below matches this */
+   fs_inst *inst;
+
+   inst = e->emit(BRW_OPCODE_CMP, reg_null_f, level, fs_reg(2));
+   inst->conditional_mod = BRW_CONDITIONAL_GE; /* taken for levels >= 2 */
+   e->emit(BRW_OPCODE_IF);
+   e->emit(e->SHR(x, width, fs_reg(1))); /* w / 2 */
+   e->emit(e->ADD(x, x, fs_reg(align_w - 1))); /* (w / 2) + (align_w - 1) */
+   e->emit(e->AND(x, x, fs_reg(0xfff8))); /* ALIGN(w / 2, align_w) */
+   e->emit(BRW_OPCODE_ELSE);
+   e->emit(e->MOV(x, fs_reg(0))); /* levels zero and one start at x = 0 */
+   e->emit(BRW_OPCODE_ENDIF);
+}
+
+static void
+update_tex_base_dimensions(struct gl_context *ctx,
+                           struct brw_fragment_program *fp, unsigned s)
+{
+   const struct gl_program *prog = &fp->program.Base;
+   
+   if (!(prog->SamplersUsed & (1 << s))) /* sampler s unused: nothing to do */
+      return;
+
+   const unsigned unit = prog->SamplerUnits[s]; /* unit that sampler s reads */
+   struct gl_texture_object *tObj = ctx->Texture.Unit[unit]._Current;
+   struct intel_texture_object *intelObj = intel_texture_object(tObj);
+   const struct intel_mipmap_tree *mt = intelObj->mt;
+   /* Publish logical_width0/height0; NOTE(review): mt assumed non-NULL. */
+   fp->tex_base_dimensions.base_width[s] = mt->logical_width0;
+   fp->tex_base_dimensions.base_height[s] = mt->logical_height0;
+}
+
+} /* empty namespace */
+
+fs_stencil_texturing::fs_stencil_texturing(fs_emitter *e,
+                                           unsigned base_level,
+                                           unsigned num_samples,
+                                           int sampler,
+                                           fs_reg *coord)
+   : e(e),
+     base_level(base_level),
+     num_samples(num_samples),
+     sampler(sampler),
+     coord(coord),
+     base_w(NULL), base_h(NULL) /* resolved later by setup_base_level() */
+{}
+
+void
+fs_stencil_texturing::offset_to_w_tiled_miplevel(const fs_reg& lod)
+{
+   fs_inst *inst;
+   fs_reg off(e, glsl_type::uint_type); /* scratch: offset for one axis */
+
+   inst = e->emit(BRW_OPCODE_CMP, reg_null_ud, lod, fs_reg(0));
+   inst->conditional_mod = BRW_CONDITIONAL_G; /* level zero needs no offset */
+   e->emit(BRW_OPCODE_IF);
+   emit_calc_level_y_offset(e, *base_h, lod, off); /* also counts lod down */
+   e->emit(e->ADD(offset(*coord, 1), offset(*coord, 1), off)); /* y += off */
+   emit_calc_level_x_offset(e, *base_w, lod, off); /* lod is <= 2 by now */
+   e->emit(e->ADD(*coord, *coord, off)); /* x += off */
+   e->emit(BRW_OPCODE_ENDIF);
+}
+
+/**
+ * In order to access individual mip-levels the program needs to know the
+ * unaligned original base level dimensions. Surface is configured for full
+ * slice (i.e., full miptree) dimensions and the program accesses a particular
+ * level manually by modifying the texel coordinates given to the sampling
+ * engine.
+ * These dimensions are supplied to the program in builtin uniforms which
+ * are maintained here.
+ */
+void
+fs_stencil_texturing::setup_base_level(struct brw_fragment_program *fp,
+                                       struct brw_stage_prog_data *prog_data)
+{
+   if (e->dispatch_width == 8) { /* 8-wide pass: allocate the two uniforms */
+      base_w = new(e->mem_ctx)
+                  fs_reg(UNIFORM, e->uniforms, BRW_REGISTER_TYPE_UD);
+      base_h = new(e->mem_ctx)
+                  fs_reg(UNIFORM, e->uniforms + 1, BRW_REGISTER_TYPE_UD);
+
+      /* Set uniform source locations for 3d-state setup. */
+      prog_data->param[e->uniforms++] =
+         (const float *)&fp->tex_base_dimensions.base_width[sampler];
+      prog_data->param[e->uniforms++] =
+         (const float *)&fp->tex_base_dimensions.base_height[sampler];
+
+      update_tex_base_dimensions(&e->brw->ctx, fp, sampler); /* seed values */
+
+      /* Rest of the fragment compiler uses pointers of ir_variable as keys
+       * but any unique pointer in fact will do. Here we use pointers to the
+       * storage of the values.
+       */
+      hash_table_insert(e->variable_ht, base_w,
+                        &fp->tex_base_dimensions.base_width[sampler]);
+      hash_table_insert(e->variable_ht, base_h,
+                        &fp->tex_base_dimensions.base_height[sampler]);
+   } else { /* presumably a later pass: reuse the registers stored above */
+     base_w = (fs_reg *)hash_table_find(
+                 e->variable_ht,
+                 &fp->tex_base_dimensions.base_width[sampler]);
+     base_h = (fs_reg *)hash_table_find(
+                 e->variable_ht,
+                 &fp->tex_base_dimensions.base_height[sampler]);
+   } /* NOTE(review): the lookup results are not checked for NULL */
+}
+
+void
+fs_stencil_texturing::emit_w_to_y_tiling(struct brw_fragment_program *fp,
+                                         struct brw_stage_prog_data *prog_data,
+                                         enum ir_texture_opcode op,
+                                         const fs_reg& lod,
+                                         const fs_reg& sample_index)
+{
+   fs_reg lod_ud; /* unsigned lod + base_level; set up for non-msaa ops only */
+
+   if (op != ir_txf_ms) {
+      setup_base_level(fp, prog_data);
+
+      /* Adjust level of detail to take into account texture setting. */
+      lod_ud = fs_reg(e, glsl_type::uint_type);
+      e->emit(e->MOV(lod_ud, lod));
+      e->emit(e->ADD(lod_ud, lod_ud, fs_reg(base_level)));
+   }
+
+   /* Surface is sampled as 2x2 blocks. The coordinates will be modified
+    * accordingly and the lowest bits designating the individual sample/pixel
+    * need to be saved for final pixel selection.
+    */
+   x_lowest_bit = fs_reg(e, glsl_type::uint_type);
+   y_lowest_bit = fs_reg(e, glsl_type::uint_type);
+   e->emit(e->AND(x_lowest_bit, *coord, fs_reg(0x1)));
+   e->emit(e->AND(y_lowest_bit, offset(*coord, 1), fs_reg(0x1)));
+
+   if (op == ir_txf_ms) {
+      assert(!"Multisampled stencil texturing is not supported");
+   } else {
+      offset_to_w_tiled_miplevel(lod_ud);
+
+      /* Point the sampling engine to the beginning of the buffer. */
+      e->emit(e->MOV(lod, fs_reg(0)));
+   }
+
+   emit_translate_w_to_y_tiling(e, *coord);
+
+   /* Drop the two lowest x bits: x now addresses whole 2x2 blocks. */
+   e->emit(e->SHR(*coord, *coord, fs_reg(2)));
+}
+
+void
+fs_stencil_texturing::emit_pick_w_tiled_sample(const fs_reg& samples,
+                                               const fs_reg& sample_index)
+{
+   fs_inst *inst; /* NOTE(review): sample_index is not used in this body */
+
+   inst = e->emit(BRW_OPCODE_CMP, reg_null_f, y_lowest_bit, fs_reg(0));
+   inst->conditional_mod = BRW_CONDITIONAL_NZ; /* saved y bit was set */
+   inst = e->emit(e->SHR(samples, samples, fs_reg(16))); /* upper 16 bits */
+   inst->predicate = BRW_PREDICATE_NORMAL;
+
+   inst = e->emit(BRW_OPCODE_CMP, reg_null_f, x_lowest_bit, fs_reg(0));
+   inst->conditional_mod = BRW_CONDITIONAL_NZ; /* saved x bit was set */
+   inst = e->emit(e->SHR(samples, samples, fs_reg(8))); /* next byte up */
+   inst->predicate = BRW_PREDICATE_NORMAL;
+
+   e->emit(e->AND(samples, samples, fs_reg(0xff))); /* keep the chosen byte */
+}
diff --git a/src/mesa/drivers/dri/i965/brw_fs_stencil_tex.h b/src/mesa/drivers/dri/i965/brw_fs_stencil_tex.h
new file mode 100644
index 0000000..5c7c42f
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_fs_stencil_tex.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_FS_STENCIL_TEX_H
+#define BRW_FS_STENCIL_TEX_H
+
+#include "brw_fs_emit.h"
+
+class fs_stencil_texturing {
+public:
+   fs_stencil_texturing(fs_emitter *e,
+                        unsigned base_level,
+                        unsigned num_samples,
+                        int sampler,
+                        fs_reg *coord);
+   /* Emit the W-to-Y coordinate rewrite; run before the texture is sampled. */
+   void emit_w_to_y_tiling(struct brw_fragment_program *fp,
+                           struct brw_stage_prog_data *prog_data,
+                           enum ir_texture_opcode op,
+                           const fs_reg& lod, const fs_reg& sample_index);
+   /* Emit selection of one byte of 'samples' using the saved lowest bits. */
+   void emit_pick_w_tiled_sample(const fs_reg& samples,
+                                 const fs_reg& sample_index);
+
+private:
+   void setup_base_level(struct brw_fragment_program *fp,
+                         struct brw_stage_prog_data *prog_data);
+   void offset_to_w_tiled_miplevel(const fs_reg& lod); /* lod gets modified */
+   
+   fs_emitter *e; /* emitter receiving the generated instructions */
+   const unsigned base_level; /* added to the lod before offsetting */
+   const unsigned num_samples; /* stored; not read by the code in this patch */
+   const int sampler; /* index into the per-sampler base dimensions */
+   fs_reg *coord; /* coordinate registers, rewritten in place */
+
+   /**
+    * Registers holding the dimensions of the base level of the texture being
+    * sampled. These are needed to resolve mip-level offsets and clamping
+    * coordinates.
+    */
+   fs_reg *base_w;
+   fs_reg *base_h;
+
+   /**
+    * Registers holding the least significant bits of the pixel coordinates.
+    * These are stored before coordinates are manipulated to designate 2x2
+    * blocks instead of individual pixels and used for the final sample
+    * selection.
+    */
+   fs_reg x_lowest_bit;
+   fs_reg y_lowest_bit;
+};
+
+#endif /* BRW_FS_STENCIL_TEX_H */
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 2d9f421..f6bb010 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1694,6 +1694,14 @@ fs_visitor::visit(ir_texture *ir)
       assert(!"Unrecognized texture opcode");
    };
 
+   fs_stencil_texturing stencil_tex(this,
+                                    key->tex.w_tiled_base_level[sampler],
+                                    key->tex.num_w_tiled_samples[sampler],
+                                    sampler, &coordinate);
+   if (key->tex.num_w_tiled_samples[sampler])
+      stencil_tex.emit_w_to_y_tiling(brw_fragment_program(fp), stage_prog_data,
+                                     ir->op, lod, sample_index);
+
    /* Writemasking doesn't eliminate channels on SIMD8 texture
     * samples, so don't worry about them.
     */
@@ -1736,6 +1744,9 @@ fs_visitor::visit(ir_texture *ir)
       emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], dst);
    }
 
+   if (key->tex.num_w_tiled_samples[sampler])
+      stencil_tex.emit_pick_w_tiled_sample(dst, sample_index);
+
    swizzle_result(ir, dst, sampler);
 }
 
-- 
1.8.3.1