[Mesa-dev] [PATCH 5/5] i965: Optimize batchbuffer macros.
Chris Wilson
chris at chris-wilson.co.uk
Sat Jul 11 11:02:45 PDT 2015
On Fri, Jul 10, 2015 at 11:44:59AM -0700, Matt Turner wrote:
> Previously OUT_BATCH was just a macro around an inline function which
> does
>
> brw->batch.map[brw->batch.used++] = dword;
>
> When making consecutive calls to intel_batchbuffer_emit_dword() the
> compiler isn't able to recognize that we're writing consecutive memory
> locations or that it doesn't need to write batch.used back to memory
> each time.
>
> We can avoid both of these problems by making a local pointer to the
> next location in the batch in BEGIN_BATCH(), indexing it with a local
> variable, and incrementing batch.used once in ADVANCE_BATCH().
>
> Cuts 18k from the .text size.
>
> text data bss dec hex filename
> 4946956 195152 26192 5168300 4edcac i965_dri.so before
> 4928588 195152 26192 5149932 4e94ec i965_dri.so after
>
> This series (including commit c0433948) improves performance of Synmark
> OglBatch7 by 3.64514% +/- 0.298131% (n=282) on Ivybridge.
> ---
> That -4.19005% +/- 1.15188% (n=30) regression on Ivybridge is now a
> performance improvement! Thanks Chris for the help!
>
> src/mesa/drivers/dri/i965/intel_batchbuffer.c | 8 ++---
> src/mesa/drivers/dri/i965/intel_batchbuffer.h | 52 +++++++++++++++++++--------
> 2 files changed, 41 insertions(+), 19 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> index f82958f..93f2872 100644
> --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> @@ -395,13 +395,13 @@ _intel_batchbuffer_flush(struct brw_context *brw,
> */
> uint32_t
> intel_batchbuffer_reloc(struct brw_context *brw,
> - drm_intel_bo *buffer,
> + drm_intel_bo *buffer, uint32_t offset,
> uint32_t read_domains, uint32_t write_domain,
> uint32_t delta)
> {
> int ret;
>
> - ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
> + ret = drm_intel_bo_emit_reloc(brw->batch.bo, offset,
> buffer, delta,
> read_domains, write_domain);
> assert(ret == 0);
> @@ -416,11 +416,11 @@ intel_batchbuffer_reloc(struct brw_context *brw,
>
> uint64_t
> intel_batchbuffer_reloc64(struct brw_context *brw,
> - drm_intel_bo *buffer,
> + drm_intel_bo *buffer, uint32_t offset,
> uint32_t read_domains, uint32_t write_domain,
> uint32_t delta)
> {
> - int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
> + int ret = drm_intel_bo_emit_reloc(brw->batch.bo, offset,
> buffer, delta,
> read_domains, write_domain);
> assert(ret == 0);
> diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
> index c0456f3..6342c97 100644
> --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
> +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
> @@ -59,14 +59,16 @@ void intel_batchbuffer_data(struct brw_context *brw,
>
> uint32_t intel_batchbuffer_reloc(struct brw_context *brw,
> drm_intel_bo *buffer,
> + uint32_t offset,
> uint32_t read_domains,
> uint32_t write_domain,
> - uint32_t offset);
> + uint32_t delta);
> uint64_t intel_batchbuffer_reloc64(struct brw_context *brw,
> drm_intel_bo *buffer,
> + uint32_t offset,
> uint32_t read_domains,
> uint32_t write_domain,
> - uint32_t offset);
> + uint32_t delta);
> static inline uint32_t float_as_int(float f)
> {
> union {
> @@ -160,23 +162,43 @@ intel_batchbuffer_advance(struct brw_context *brw)
> #endif
> }
>
> -#define BEGIN_BATCH(n) intel_batchbuffer_begin(brw, n, RENDER_RING)
> -#define BEGIN_BATCH_BLT(n) intel_batchbuffer_begin(brw, n, BLT_RING)
> -#define OUT_BATCH(d) intel_batchbuffer_emit_dword(brw, d)
> -#define OUT_BATCH_F(f) intel_batchbuffer_emit_float(brw, f)
> -#define OUT_RELOC(buf, read_domains, write_domain, delta) \
> - OUT_BATCH(intel_batchbuffer_reloc(brw, buf, read_domains, write_domain, \
> - delta))
> +#define BEGIN_BATCH(n) do { \
> + intel_batchbuffer_begin(brw, (n), RENDER_RING); \
> + uint32_t *__map = &brw->batch.map[brw->batch.used]; \
> + int __idx = 0, UNUSED __final_idx = (n)
> +
> +#define BEGIN_BATCH_BLT(n) do { \
> + intel_batchbuffer_begin(brw, (n), BLT_RING); \
> + uint32_t *__map = &brw->batch.map[brw->batch.used]; \
> + int __idx = 0, UNUSED __final_idx = (n)
>From your inspiration, I used
uint32_t *__map = brw->batch.ptr;
brw->batch.ptr += count;
And then emit_dword: *__map++ = dw;
This allows us to use __map as a parameter to those functions
(set_blitter_tiling, emit_vertex_state) i.e instead of having to convert
them to a macro, you can just:
diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index 0606a77..41155a9 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -602,8 +602,9 @@ brw_prepare_shader_draw_parameters(struct brw_context *brw)
/**
* Emit a VERTEX_BUFFER_STATE entry (part of 3DSTATE_VERTEX_BUFFERS).
*/
-static void
+static uint32_t *
emit_vertex_buffer_state(struct brw_context *brw,
+ uint32_t *out,
unsigned buffer_nr,
brw_bo *bo,
unsigned bo_ending_address,
@@ -641,6 +642,8 @@ emit_vertex_buffer_state(struct brw_context *brw,
OUT_BATCH(0);
}
OUT_BATCH(step_rate);
+
+ return out;
}
For the idx version, you can just pass idx in/out ofc.
Then you also don't have to track __final_idx for non-debug builds as the
check is just assert(__map == brw->batch.map);
The only catch is changing OUT_RELOC to use __map - brw->batch.base, but
you have to change OUT_RELOC for idx as well.
Overall, given the locals gcc is able to generate efficient code for
either *ptr++ or ptr[idx++], so it is a question of what will be easier
to use in the future. For that I think you would like functions
whereever possible.
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
More information about the mesa-dev
mailing list