[Mesa-dev] [RFC 21/27] i965/blorp: Refactor w-tiling to y-tiling translation

Sat Feb 22 01:05:47 PST 2014

Signed-off-by: Topi Pohjolainen <topi.pohjolainen at intel.com>
---
 src/mesa/drivers/dri/i965/brw_blorp_blit.cpp |  23 +-----
 src/mesa/drivers/dri/i965/brw_fs.h           |   7 ++
 src/mesa/drivers/dri/i965/brw_fs_emitter.cpp | 110 +++++++++++++++++++++++++++
 3 files changed, 118 insertions(+), 22 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
index 9745c28..d8dc49b 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
@@ -1177,28 +1177,7 @@ brw_blorp_blit_program::translate_tiling(bool old_tiled_w, bool new_tiled_w)
       emit(OR(Yp, t1, t2));
       SWAP_XY_AND_XPYP();
    } else {
-      /* Applying the same logic as above, but in reverse, we obtain the
-       * formulas:
-       *
-       * X' = (X & ~0b101) << 1 | (Y & 0b10) << 2 | (Y & 0b1) << 1 | X & 0b1
-       * Y' = (Y & ~0b11) >> 1 | (X & 0b100) >> 2
-       */
-      emit(AND(t1, X, brw_imm_uw(0xfffa))); /* X & ~0b101 */
-      emit(SHL(t1, t1, brw_imm_uw(1))); /* (X & ~0b101) << 1 */
-      emit(AND(t2, Y, brw_imm_uw(2))); /* Y & 0b10 */
-      emit(SHL(t2, t2, brw_imm_uw(2))); /* (Y & 0b10) << 2 */
-      emit(OR(t1, t1, t2)); /* (X & ~0b101) << 1 | (Y & 0b10) << 2 */
-      emit(AND(t2, Y, brw_imm_uw(1))); /* Y & 0b1 */
-      emit(SHL(t2, t2, brw_imm_uw(1))); /* (Y & 0b1) << 1 */
-      emit(OR(t1, t1, t2)); /* (X & ~0b101) << 1 | (Y & 0b10) << 2
-                                  | (Y & 0b1) << 1 */
-      emit(AND(t2, X, brw_imm_uw(1))); /* X & 0b1 */
-      emit(OR(Xp, t1, t2));
-      emit(AND(t1, Y, brw_imm_uw(0xfffc))); /* Y & ~0b11 */
-      emit(SHR(t1, t1, brw_imm_uw(1))); /* (Y & ~0b11) >> 1 */
-      emit(AND(t2, X, brw_imm_uw(4))); /* X & 0b100 */
-      emit(SHR(t2, t2, brw_imm_uw(2))); /* (X & 0b100) >> 2 */
-      emit(OR(Yp, t1, t2));
+      emit_translate_w_to_y_tiling(t1, t2, X, Y, Xp, Yp);
       SWAP_XY_AND_XPYP();
    }
 }
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 5b0687e..e02c025 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -290,6 +290,13 @@ protected:
    fs_inst *emit(fs_inst *inst);
    void emit(exec_list list);
 
+   void emit_translate_w_to_y_tiling(const fs_reg& t1,     /* scratch */
+                                     const fs_reg& t2,     /* scratch */
+                                     const fs_reg& src_x,  /* W-tiled X */
+                                     const fs_reg& src_y,  /* W-tiled Y */
+                                     const fs_reg& dst_x,  /* Y-tiled X' */
+                                     const fs_reg& dst_y); /* Y-tiled Y' */
+
    void push_force_uncompressed();
    void pop_force_uncompressed();
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_emitter.cpp b/src/mesa/drivers/dri/i965/brw_fs_emitter.cpp
index 6f1e2dd..6ba6516 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_emitter.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_emitter.cpp
@@ -194,6 +194,116 @@ fs_emitter::pop_force_uncompressed()
    assert(force_uncompressed_stack >= 0);
 }
 
+/**
+ * Emit translation of pixel coordinates src_x and src_y in W-tiled layout
+ * to corresponding coordinates dst_x and dst_y in Y-tiled layout.
+ * The operation requires two temporary registers in addition to the source
+ * and destination. Note also that source and destination registers cannot
+ * overlap.
+ *
+ * Both W-tiling and Y-tiling have equal tile size of one page. The difference
+ * is in how pixels are organised within the page: W-tile has 64 rows each
+ * holding in turn two 32 byte sub-tiles whereas Y-tile has 32 rows each
+ * holding eight 16 byte sub-tiles. The sub-tiles in turn have different
+ * layout: W is 8x4 bytes and Y is 16x1. Now, in Y-tiling two consecutive
+ * sub-tiles are on top of each other. If each such pair is thought of as one
+ * sub-tile instead, Y-tiling can be seen as consisting of 16 rows and eight
+ * columns of 32-byte sub-tiles.
+ *
+ * This organisation is independent of the pixel format used and
+ * hence the number of pixels within a tile varies based on how many bytes
+ * per pixel are needed.
+ *
+ * The operation here is fixed to one-byte-per-pixel formats only - it
+ * assumes that Y-subtile holds 16 pixels per row (and W 8 respectively).
+ * 
+ * First examine the X coordinate representing an address using W-tiling.
+ * The lowest six bits represent a column within a tile while the higher bits
+ * designate a tile number horizontally.
+ * As a Y-tile can hold twice as many pixels horizontally as a W-tile, the
+ * tile number needs to be multiplied by two in order to move to the desired
+ * tile horizontally:
+ *
+ *    (X & ~0b111) << 1  ==  (X & 0xFFF8) << 1           (1)
+ *
+ * The lowest six can be further divided in two parts - the subtile number
+ * and then the remaining coordinate within the subtile. These are three and
+ * three respectively for W-tiling. Then unlike Y-subtile W is further
+ * divided into 4x4 and again into 2x2 tiles. Hence the third lowest bit
+ * represents the 4x4-subtile number, the second lowest the 2x2-subtile
+ * number and finally the lowest the offset within the 2x2 block.
+ *
+ *     01 23  45 67         0123456789ABCDEF             W      Y
+ *   ++==+==++==+==++      +----------------+      a:   3,1    7,0
+ * 0 ||  |  ||  |  ||    0 |       a b      |      b:   1,2    9,0
+ * 1 ||  | a||  |  ||    1 |          c     |      c:   4,3   10,1
+ *   ++--+--++--+--++      +----------------+
+ * 2 || b|  ||  |  ||
+ * 3 ||  |  ||c |  ||
+ *   ++==+==++==+==++
+ *
+ * Observing the W layout it can be seen that x-coordinates greater than or
+ * equal to four reside on the second half of the subtile - in Y-tile this
+ * corresponds to the second row calling for the following compensation in
+ * the vertical coordinate:
+ *
+ *    (X & 0b100) >> 2   ==   (X & 0x4) >> 2              (2)
+ *
+ * The 2x2 subtile in turn results into two x-coordinates x and x + 2 in the
+ * same row (in the same 8x4 subtile) to be 4 bytes apart in linear memory.
+ * As addresses in Y-subtile itself are linear, the compensation in the
+ * horizontal coordinate is:
+ *
+ *    (X & 0b10) << 1                                     (3)
+ *
+ * Combined with (1):
+ *
+ *    (X & ~0b101) << 1  ==  (X & 0xFFFA) << 1            (4)
+ *
+ * Similarly in 4x4 W-subtile in the same column any two y-coordinates y and
+ * y + 2 are 8 bytes apart in linear memory addresses. In 2x2-subtile in turn
+ * y and y + 1 are two bytes apart. This results into horizontal compensation
+ * in Y layout:
+ *
+ *    (Y & 0b10) << 2 | (Y & 0b1) << 1                    (5)
+ *
+ * Taking into account (4) and (5), one gets:
+ *
+ *    X' = (X & ~0b101) << 1 | (Y & 0b10) << 2 | (Y & 0b1) << 1 | X & 0b1
+ *
+ * For y-coordinate one needs to consider full tiles and (2). As Y-layout
+ * has twice as many tiles as W horizontally, the number of tiles in vertical
+ * direction needs to be divided by two. Every two tiles on top of each other
+ * in W-layout are laid out side by side horizontally in Y-layout.
+ *
+ *    Y' = (Y & ~0b11) >> 1 | (X & 0b100) >> 2
+ */
+void
+fs_emitter::emit_translate_w_to_y_tiling(const fs_reg& t1,    /* scratch */
+                                         const fs_reg& t2,    /* scratch */
+                                         const fs_reg& src_x, /* W-tiled X */
+                                         const fs_reg& src_y, /* W-tiled Y */
+                                         const fs_reg& dst_x, /* out: X' */
+                                         const fs_reg& dst_y) /* out: Y' */
+{
+   emit(AND(t1, src_x, brw_imm_uw(0xfffa))); /* X & ~0b101 */
+   emit(SHL(t1, t1, brw_imm_uw(1))); /* (X & ~0b101) << 1 */
+   emit(AND(t2, src_y, brw_imm_uw(2))); /* Y & 0b10 */
+   emit(SHL(t2, t2, brw_imm_uw(2))); /* (Y & 0b10) << 2 */
+   emit(OR(t1, t1, t2)); /* (X & ~0b101) << 1 | (Y & 0b10) << 2 */
+   emit(AND(t2, src_y, brw_imm_uw(1))); /* Y & 0b1 */
+   emit(SHL(t2, t2, brw_imm_uw(1))); /* (Y & 0b1) << 1 */
+   emit(OR(t1, t1, t2)); /* (X & ~0b101) << 1 | (Y & 0b10) << 2
+                               | (Y & 0b1) << 1 */
+   emit(AND(t2, src_x, brw_imm_uw(1))); /* X & 0b1 */
+   emit(OR(dst_x, t1, t2)); /* X' done; src_x and src_y are still read below */
+   emit(AND(t1, src_y, brw_imm_uw(0xfffc))); /* Y & ~0b11 */
+   emit(SHR(t1, t1, brw_imm_uw(1))); /* (Y & ~0b11) >> 1 */
+   emit(AND(t2, src_x, brw_imm_uw(4))); /* X & 0b100 */
+   emit(SHR(t2, t2, brw_imm_uw(2))); /* (X & 0b100) >> 2 */
+   emit(OR(dst_y, t1, t2)); /* Y' = (Y & ~0b11) >> 1 | (X & 0b100) >> 2 */
+}
+
 fs_emitter::fs_emitter(struct brw_context *brw,
                        struct brw_wm_compile *_c,
                        unsigned _dispatch_width)
-- 
1.8.3.1