Mesa (master): vc4: Optimize CL emits by doing size checks up front.

Eric Anholt anholt at kemper.freedesktop.org
Wed Dec 24 20:28:48 UTC 2014


Module: Mesa
Branch: master
Commit: 229bf4475ff0a5dbeb9bc95250f7a40a983c2e28
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=229bf4475ff0a5dbeb9bc95250f7a40a983c2e28

Author: Eric Anholt <eric at anholt.net>
Date:   Mon Dec 22 10:09:10 2014 -0800

vc4: Optimize CL emits by doing size checks up front.

The optimizer obviously doesn't have the ability to rewrite these to skip
the size checks per call, so we have to do it manually.

Improves a norast benchmark on simulation by 0.779706% +/- 0.405838%
(n=6087).

---

 src/gallium/drivers/vc4/vc4_cl.c      |   12 ++++++++----
 src/gallium/drivers/vc4/vc4_cl.h      |   17 +++++++----------
 src/gallium/drivers/vc4/vc4_context.c |   20 ++++++++++++++++++--
 src/gallium/drivers/vc4/vc4_draw.c    |   30 ++++++++++++++++++++++++++++++
 src/gallium/drivers/vc4/vc4_program.c |    3 +++
 5 files changed, 66 insertions(+), 16 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_cl.c b/src/gallium/drivers/vc4/vc4_cl.c
index 36dd28c..0700e88 100644
--- a/src/gallium/drivers/vc4/vc4_cl.c
+++ b/src/gallium/drivers/vc4/vc4_cl.c
@@ -29,17 +29,21 @@ void
 vc4_init_cl(struct vc4_context *vc4, struct vc4_cl *cl)
 {
         cl->base = ralloc_size(vc4, 1);
-        cl->end = cl->next = cl->base;
+        cl->next = cl->base;
+        cl->size = 0;
 }
 
 void
-vc4_grow_cl(struct vc4_cl *cl)
+cl_ensure_space(struct vc4_cl *cl, uint32_t space)
 {
-        uint32_t size = MAX2((cl->end - cl->base) * 2, 4096);
+        if ((cl->next - cl->base) + space <= cl->size)
+                return;
+
+        uint32_t size = MAX2(cl->size + space, cl->size * 2);
         uint32_t offset = cl->next -cl->base;
 
         cl->base = reralloc(ralloc_parent(cl->base), cl->base, uint8_t, size);
-        cl->end = cl->base + size;
+        cl->size = size;
         cl->next = cl->base + offset;
 }
 
diff --git a/src/gallium/drivers/vc4/vc4_cl.h b/src/gallium/drivers/vc4/vc4_cl.h
index 86cd0c7..33b3729 100644
--- a/src/gallium/drivers/vc4/vc4_cl.h
+++ b/src/gallium/drivers/vc4/vc4_cl.h
@@ -35,13 +35,12 @@ struct vc4_bo;
 struct vc4_cl {
         void *base;
         void *next;
-        void *end;
+        uint32_t size;
         uint32_t reloc_next;
         uint32_t reloc_count;
 };
 
 void vc4_init_cl(struct vc4_context *vc4, struct vc4_cl *cl);
-void vc4_grow_cl(struct vc4_cl *cl);
 void vc4_reset_cl(struct vc4_cl *cl);
 void vc4_dump_cl(void *cl, uint32_t size, bool is_render);
 uint32_t vc4_gem_hindex(struct vc4_context *vc4, struct vc4_bo *bo);
@@ -49,8 +48,7 @@ uint32_t vc4_gem_hindex(struct vc4_context *vc4, struct vc4_bo *bo);
 static inline void
 cl_u8(struct vc4_cl *cl, uint8_t n)
 {
-        if (cl->next + 1 > cl->end)
-                vc4_grow_cl(cl);
+        assert((cl->next - cl->base) + 1 <= cl->size);
 
         *(uint8_t *)cl->next = n;
         cl->next++;
@@ -59,8 +57,7 @@ cl_u8(struct vc4_cl *cl, uint8_t n)
 static inline void
 cl_u16(struct vc4_cl *cl, uint32_t n)
 {
-        if (cl->next + 2 > cl->end)
-                vc4_grow_cl(cl);
+        assert((cl->next - cl->base) + 2 <= cl->size);
 
         *(uint16_t *)cl->next = n;
         cl->next += 2;
@@ -69,8 +66,7 @@ cl_u16(struct vc4_cl *cl, uint32_t n)
 static inline void
 cl_u32(struct vc4_cl *cl, uint32_t n)
 {
-        if (cl->next + 4 > cl->end)
-                vc4_grow_cl(cl);
+        assert((cl->next - cl->base) + 4 <= cl->size);
 
         *(uint32_t *)cl->next = n;
         cl->next += 4;
@@ -79,8 +75,7 @@ cl_u32(struct vc4_cl *cl, uint32_t n)
 static inline void
 cl_ptr(struct vc4_cl *cl, void *ptr)
 {
-        if (cl->next + sizeof(void *) > cl->end)
-                vc4_grow_cl(cl);
+        assert((cl->next - cl->base) + sizeof(void *) <= cl->size);
 
         *(void **)cl->next = ptr;
         cl->next += sizeof(void *);
@@ -134,4 +129,6 @@ cl_reloc(struct vc4_context *vc4, struct vc4_cl *cl,
         cl_reloc_hindex(cl, vc4_gem_hindex(vc4, bo), offset);
 }
 
+void cl_ensure_space(struct vc4_cl *cl, uint32_t size);
+
 #endif /* VC4_CL_H */
diff --git a/src/gallium/drivers/vc4/vc4_context.c b/src/gallium/drivers/vc4/vc4_context.c
index 906af05..d4a9eec 100644
--- a/src/gallium/drivers/vc4/vc4_context.c
+++ b/src/gallium/drivers/vc4/vc4_context.c
@@ -104,6 +104,22 @@ vc4_setup_rcl(struct vc4_context *vc4)
                 resolve_uncleared);
 #endif
 
+        uint32_t reloc_size = 9;
+        uint32_t clear_size = 14;
+        uint32_t config_size = 11 + reloc_size;
+        uint32_t loadstore_size = 7 + reloc_size;
+        uint32_t tilecoords_size = 3;
+        uint32_t branch_size = 5 + reloc_size;
+        uint32_t color_store_size = 1;
+        cl_ensure_space(&vc4->rcl,
+                        clear_size +
+                        config_size +
+                        loadstore_size +
+                        xtiles * ytiles * (loadstore_size * 4 +
+                                           tilecoords_size * 3 +
+                                           branch_size +
+                                           color_store_size));
+
         cl_u8(&vc4->rcl, VC4_PACKET_CLEAR_COLORS);
         cl_u32(&vc4->rcl, vc4->clear_color[0]);
         cl_u32(&vc4->rcl, vc4->clear_color[1]);
@@ -290,9 +306,9 @@ vc4_flush(struct pipe_context *pctx)
 
         if (vc4_debug & VC4_DEBUG_CL) {
                 fprintf(stderr, "BCL:\n");
-                vc4_dump_cl(vc4->bcl.base, vc4->bcl.end - vc4->bcl.base, false);
+                vc4_dump_cl(vc4->bcl.base, vc4->bcl.size, false);
                 fprintf(stderr, "RCL:\n");
-                vc4_dump_cl(vc4->rcl.base, vc4->rcl.end - vc4->rcl.base, true);
+                vc4_dump_cl(vc4->rcl.base, vc4->rcl.size, true);
         }
 
         struct drm_vc4_submit_cl submit;
diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c
index 79d7d73..d99faa4 100644
--- a/src/gallium/drivers/vc4/vc4_draw.c
+++ b/src/gallium/drivers/vc4/vc4_draw.c
@@ -29,6 +29,32 @@
 #include "vc4_context.h"
 #include "vc4_resource.h"
 
+static void
+vc4_get_draw_cl_space(struct vc4_context *vc4)
+{
+        /* Binner gets our packet state -- vc4_emit.c contents,
+         * and the primitive itself.
+         */
+        cl_ensure_space(&vc4->bcl, 256);
+
+        /* Nothing for rcl -- that's covered by vc4_context.c */
+
+        /* shader_rec gets up to 12 dwords of reloc handles plus a maximally
+         * sized shader_rec (104 bytes base for 8 vattrs plus 32 bytes of
+         * vattr stride).
+         */
+        cl_ensure_space(&vc4->shader_rec, 12 * sizeof(uint32_t) + 104 + 8 * 32);
+
+        /* Uniforms are covered by vc4_write_uniforms(). */
+
+        /* There could be up to 16 textures per stage, plus misc other
+         * pointers.
+         */
+        cl_ensure_space(&vc4->bo_handles, (2 * 16 + 20) * sizeof(uint32_t));
+        cl_ensure_space(&vc4->bo_pointers,
+                        (2 * 16 + 20) * sizeof(struct vc4_bo *));
+}
+
 /**
  * Does the initial bining command list setup for drawing to a given FBO.
  */
@@ -38,6 +64,8 @@ vc4_start_draw(struct vc4_context *vc4)
         if (vc4->needs_flush)
                 return;
 
+        vc4_get_draw_cl_space(vc4);
+
         uint32_t width = vc4->framebuffer.width;
         uint32_t height = vc4->framebuffer.height;
         uint32_t tilew = align(width, 64) / 64;
@@ -114,6 +142,8 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
                 return;
         }
 
+        vc4_get_draw_cl_space(vc4);
+
         struct vc4_vertex_stateobj *vtx = vc4->vtx;
         struct vc4_vertexbuf_stateobj *vertexbuf = &vc4->vertexbuf;
 
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 4b547c5..570c76a 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -2729,6 +2729,9 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
         const uint32_t *gallium_uniforms = cb->cb[0].user_buffer;
         struct vc4_bo *ubo = vc4_upload_ubo(vc4, shader, gallium_uniforms);
 
+        cl_ensure_space(&vc4->uniforms, (uinfo->count +
+                                         uinfo->num_texture_samples) * 4);
+
         cl_start_shader_reloc(&vc4->uniforms, uinfo->num_texture_samples);
 
         for (int i = 0; i < uinfo->count; i++) {




More information about the mesa-commit mailing list