[Mesa-dev] [PATCH 2/8] i965: Micro-optimize brw_get_index_type

Ian Romanick idr at freedesktop.org
Fri Dec 19 14:20:53 PST 2014


From: Ian Romanick <ian.d.romanick at intel.com>

With the switch-statement, GCC 4.8.3 produces a small pile of code with
a branch.

00000000 <brw_get_index_type>:
  000000:       8b 54 24 04             mov    0x4(%esp),%edx
  000004:       b8 01 00 00 00          mov    $0x1,%eax
  000009:       81 fa 03 14 00 00       cmp    $0x1403,%edx
  00000f:       74 0d                   je     00001e <brw_get_index_type+0x1e>
  000011:       31 c0                   xor    %eax,%eax
  000013:       81 fa 05 14 00 00       cmp    $0x1405,%edx
  000019:       0f 94 c0                sete   %al
  00001c:       01 c0                   add    %eax,%eax
  00001e:       c3                      ret

However, this could be two instructions.

00000000 <brw_get_index_type>:
  000000:       2d 01 14 00 00          sub    $0x1401,%eax
  000005:       d1 e8                   shr    %eax
  000007:       90                      nop
  000008:       90                      nop
  000009:       90                      nop
  00000a:       90                      nop
  00000b:       c3                      ret

The function was also moved to the header so that it could be inlined at
the two call sites.  Without this, 32-bit also needs to pull the
parameter from the stack.  This means there is a push, a call, a move,
and a ret added to a two instruction function.  The above code shows the
function with __attribute__((regparm=1)), but even this adds several
extra instructions.  There is also an extra instruction on 64-bit to
move the parameter to %eax for the subtract.

On Bay Trail-D using Fedora 20 compile flags (-m64 -O2 -mtune=generic
for 64-bit and -m32 -march=i686 -mtune=atom for 32-bit), affects
Gl32Batch7:

32-bit: Difference at 95.0% confidence 0.818589% +/- 0.234661% (n=40)
64-bit: Difference at 95.0% confidence 0.54554% +/- 0.354092% (n=40)

Signed-off-by: Ian Romanick <ian.d.romanick at intel.com>
---
 src/mesa/drivers/dri/i965/brw_context.h      | 22 +++++++++++++++++++++-
 src/mesa/drivers/dri/i965/brw_draw_upload.c  | 13 +------------
 src/mesa/drivers/dri/i965/gen8_draw_upload.c |  2 +-
 3 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index a63c483..92eb022 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1602,7 +1602,27 @@ gl_clip_plane *brw_select_clip_planes(struct gl_context *ctx);
 /* brw_draw_upload.c */
 unsigned brw_get_vertex_surface_type(struct brw_context *brw,
                                      const struct gl_client_array *glarray);
-unsigned brw_get_index_type(GLenum type);
+
+static inline unsigned
+brw_get_index_type(GLenum type)
+{
+   assert((type == GL_UNSIGNED_BYTE)
+          || (type == GL_UNSIGNED_SHORT)
+          || (type == GL_UNSIGNED_INT));
+
+   /* The possible values for type are GL_UNSIGNED_BYTE (0x1401),
+    * GL_UNSIGNED_SHORT (0x1403), and GL_UNSIGNED_INT (0x1405) which we want
+    * to map to scale factors of 0, 1, and 2, respectively.  These scale
+    * factors are then left-shfited by 8 to be in the correct position in the
+    * CMD_INDEX_BUFFER packet.
+    *
+    * Subtracting 0x1401 gives 0, 2, and 4.  Shifting left by 7 afterwards
+    * gives 0x00000000, 0x00000100, and 0x00000200.  These just happen to be
+    * the values the need to be written in the CMD_INDEX_BUFFER packet.
+    */
+   return (type - 0x1401) << 7;
+}
+
 void brw_prepare_vertices(struct brw_context *brw);
 
 /* brw_wm_surface_state.c */
diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index 6e0cf3e..e46c54b 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -344,17 +344,6 @@ brw_get_vertex_surface_type(struct brw_context *brw,
    }
 }
 
-unsigned
-brw_get_index_type(GLenum type)
-{
-   switch (type) {
-   case GL_UNSIGNED_BYTE:  return BRW_INDEX_BYTE;
-   case GL_UNSIGNED_SHORT: return BRW_INDEX_WORD;
-   case GL_UNSIGNED_INT:   return BRW_INDEX_DWORD;
-   default: unreachable("not reached");
-   }
-}
-
 static void
 copy_array_to_vbo_array(struct brw_context *brw,
 			struct brw_vertex_element *element,
@@ -973,7 +962,7 @@ static void brw_emit_index_buffer(struct brw_context *brw)
    BEGIN_BATCH(3);
    OUT_BATCH(CMD_INDEX_BUFFER << 16 |
              cut_index_setting |
-             brw_get_index_type(index_buffer->type) << 8 |
+             brw_get_index_type(index_buffer->type) |
              1);
    OUT_RELOC(brw->ib.bo,
              I915_GEM_DOMAIN_VERTEX, 0,
diff --git a/src/mesa/drivers/dri/i965/gen8_draw_upload.c b/src/mesa/drivers/dri/i965/gen8_draw_upload.c
index 0d6feec..1af90ec 100644
--- a/src/mesa/drivers/dri/i965/gen8_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/gen8_draw_upload.c
@@ -251,7 +251,7 @@ gen8_emit_index_buffer(struct brw_context *brw)
 
    BEGIN_BATCH(5);
    OUT_BATCH(CMD_INDEX_BUFFER << 16 | (5 - 2));
-   OUT_BATCH(brw_get_index_type(index_buffer->type) << 8 | mocs_wb);
+   OUT_BATCH(brw_get_index_type(index_buffer->type) | mocs_wb);
    OUT_RELOC64(brw->ib.bo, I915_GEM_DOMAIN_VERTEX, 0, 0);
    OUT_BATCH(brw->ib.bo->size);
    ADVANCE_BATCH();
-- 
1.8.1.4



More information about the mesa-dev mailing list