[Mesa-dev] [PATCH 16/20] i965/fs: Add support for W-tiled to linear coordinate translation

Fri Apr 11 00:28:56 PDT 2014

Signed-off-by: Topi Pohjolainen <topi.pohjolainen at intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs.h           |  8 +++
 src/mesa/drivers/dri/i965/brw_fs_emitter.cpp | 93 ++++++++++++++++++++++++++++
 2 files changed, 101 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index eaa5332..0d9cbd1 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -303,6 +303,14 @@ protected:
                                 const fs_reg& dst_x,
                                 const fs_reg& dst_y);
 
+   void emit_w_tiling_to_linear(const fs_reg& t1, /* temporary */
+                                const fs_reg& t2, /* temporary */
+                                const fs_reg& stride, /* divisor */
+                                const fs_reg& src_x, /* W-tiled x */
+                                const fs_reg& src_y, /* W-tiled y */
+                                const fs_reg& dst_x, /* linear x, written */
+                                const fs_reg& dst_y); /* linear y, written */
+
    void push_force_uncompressed();
    void pop_force_uncompressed();
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_emitter.cpp b/src/mesa/drivers/dri/i965/brw_fs_emitter.cpp
index 0d5cfb4..88e898d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_emitter.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_emitter.cpp
@@ -294,6 +294,99 @@ fs_emitter::emit_linear_to_w_tiling(const fs_reg& t1,
       emit_coord_swizzling(t1, dst_x, dst_y);
 }
 
+/**
+ * Emit translation of coordinates src_x and src_y in W-tiled space into
+ * corresponding coordinates dst_x and dst_y in linear layout.
+ * Consider the tiling algorithm formula in Ironlake and Sandybridge PRMs:
+ * Volume 1, Part 2, Section 4.5.3. The equation translates coordinates x and
+ * y in w-tiling layout into corresponding byte offset in linear memory.
+ * The operation requires two temporary registers in addition to the source
+ * and destination. Note also that source and destination registers cannot
+ * overlap.
+ *
+ *  tile_x = x / 64
+ *  tile_y = y / 64
+ *  byte_x = x % 64
+ *  byte_y = y % 64
+ *
+ *  u = tile_y * 64 * stride               u = (y & 0xffc0) * stride
+ *    + tile_x * 4096                        + (x & 0xfff8) * 64
+ *    + 512 * (byte_x / 8)
+ *    +  64 * (byte_y / 8)                   + 8 * (y & 0x3c)
+ *    +  32 * ((byte_y / 4) % 2)          
+ *    +  16 * ((byte_x / 4) % 2)    <==>     + 4 * (x & 0x4)
+ *    +   8 * ((byte_y / 2) % 2)             + 4 * (y & 0x2)
+ *    +   4 * ((byte_x / 2) % 2)             + 2 * (x & 0x2)
+ *    +   2 * (byte_y % 2)                   + 2 * (y & 0x1)
+ *    +   1 * (byte_x % 2)                   + 1 * (x & 0x1)
+ *
+ *  where
+ *
+ *   8 * (y & 0x3c) = 8 * (y & 0x38) + 8 * (y & 0x4)
+ *                  = 64 * ((y % 64) / 8) + 8 * 4 * (((y % 64) / 4) % 2)
+ *
+ *   (x & 0xfff8) * 64 = 64 * (x & ~0x3f) + 64 * (x & 0x38)
+ *                     = 64 * 64 * (x / 64) + 64 * 8 * ((x % 64) / 8)
+ *
+ * The linear offset corresponds to linear coordinates x_p, y_p simply as:
+ * u = y_p * stride + x_p. Dividing both sides by stride and taking into
+ * account that integer division rounds towards zero yields:
+ *
+ *  y_p = (y & 0xffc0)
+ *      + ((x & 0xfff8) * 64 + 8 * (y & 0x3c)) / stride
+ *
+ *  x_p = 4 * (x & 0x4)
+ *      + 4 * (y & 0x2)
+ *      + 2 * (x & 0x2)
+ *      + 2 * (y & 0x1)
+ *      + 1 * (x & 0x1)
+ *      + ((x & 0xfff8) * 64 + 8 * (y & 0x3c)) % stride
+ */
+void
+fs_emitter::emit_w_tiling_to_linear(const fs_reg& t1,
+                                    const fs_reg& t2,
+                                    const fs_reg& stride,
+                                    const fs_reg& src_x,
+                                    const fs_reg& src_y,
+                                    const fs_reg& dst_x,
+                                    const fs_reg& dst_y)
+{
+   if (brw->has_swizzling)
+      emit_coord_swizzling(t1, src_x, src_y); /* on the tiled coords */
+
+   emit(AND(t1, src_y, brw_imm_uw(0x3c))); /* src_y & 0x3c */
+   emit(SHL(t1, t1, brw_imm_uw(3))); /* 8 * (src_y & 0x3c) */
+   emit(AND(t2, src_x, brw_imm_uw(0xfff8))); /* src_x & 0xfff8 */
+   emit(SHL(t2, t2, brw_imm_uw(6))); /* (src_x & 0xfff8) * 64 */
+   emit(ADD(t1, t1, t2)); /* (src_x & 0xfff8) * 64 + 8 * (src_y & 0x3c) */
+
+   /* On gen6 math needs a register with hstride == 1, make a copy. */
+   if (brw->gen == 6)
+      emit(MOV(t2, stride)); /* t2 is free: already added into t1 */
+   
+   /* dst_y = ((src_x & 0xfff8) * 64 + 8 * (src_y & 0x3c)) / stride */
+   emit(SHADER_OPCODE_INT_QUOTIENT, dst_y, t1, brw->gen == 6 ? t2 : stride);
+   /* dst_x = ((src_x & 0xfff8) * 64 + 8 * (src_y & 0x3c)) % stride */
+   emit(SHADER_OPCODE_INT_REMAINDER, dst_x, t1, brw->gen == 6 ? t2 : stride);
+
+   emit(AND(t1, src_y, brw_imm_uw(0xffc0))); /* src_y & 0xffc0 = tile_y * 64 */
+   emit(ADD(dst_y, dst_y, t1)); /* dst_y += (src_y & 0xffc0) */
+   emit(AND(t1, src_x, brw_imm_uw(0x4))); /* src_x & 0x4 */
+   emit(SHL(t1, t1, brw_imm_uw(2))); /* (src_x & 0x4) * 4 */
+   emit(ADD(dst_x, dst_x, t1)); /* dst_x += ((src_x & 0x4) * 4) */
+   emit(AND(t1, src_y, brw_imm_uw(0x2))); /* src_y & 0x2 */
+   emit(SHL(t1, t1, brw_imm_uw(2))); /* (src_y & 0x2) * 4 */
+   emit(ADD(dst_x, dst_x, t1)); /* dst_x += ((src_y & 0x2) * 4) */
+   emit(AND(t1, src_x, brw_imm_uw(0x2))); /* src_x & 0x2 */
+   emit(SHL(t1, t1, brw_imm_uw(1))); /* (src_x & 0x2) * 2 */
+   emit(ADD(dst_x, dst_x, t1)); /* dst_x += ((src_x & 0x2) * 2) */
+   emit(AND(t1, src_y, brw_imm_uw(0x1))); /* src_y & 0x1 */
+   emit(SHL(t1, t1, brw_imm_uw(1))); /* (src_y & 0x1) * 2 */
+   emit(ADD(dst_x, dst_x, t1)); /* dst_x += ((src_y & 0x1) * 2) */
+   emit(AND(t1, src_x, brw_imm_uw(0x1))); /* src_x & 0x1 */
+   emit(ADD(dst_x, dst_x, t1)); /* dst_x += (src_x & 0x1) */
+}
+
 fs_emitter::fs_emitter(struct brw_context *brw,
                        struct brw_wm_compile *c,
                        unsigned dispatch_width)
-- 
1.8.3.1