[Mesa-dev] [PATCH 5/5] i965: Optimize batchbuffer macros.

Mon Jul 13 14:53:11 PDT 2015

On Sat, Jul 11, 2015 at 11:02 AM, Chris Wilson <chris at chris-wilson.co.uk> wrote:
> On Fri, Jul 10, 2015 at 11:44:59AM -0700, Matt Turner wrote:
>> Previously OUT_BATCH was just a macro around an inline function which
>> does
>>
>>    brw->batch.map[brw->batch.used++] = dword;
>>
>> When making consecutive calls to intel_batchbuffer_emit_dword() the
>> compiler isn't able to recognize that we're writing consecutive memory
>> locations or that it doesn't need to write batch.used back to memory
>> each time.
>>
>> We can avoid both of these problems by making a local pointer to the
>> next location in the batch in BEGIN_BATCH(), indexing it with a local
>> variable, and incrementing batch.used once in ADVANCE_BATCH().
>>
>> Cuts 18k from the .text size.
>>
>>    text     data      bss      dec      hex  filename
>> 4946956   195152    26192  5168300   4edcac  i965_dri.so before
>> 4928588   195152    26192  5149932   4e94ec  i965_dri.so after
>>
>> This series (including commit c0433948) improves performance of Synmark
>> OglBatch7 by 3.64514% +/- 0.298131% (n=282) on Ivybridge.
>> ---
>> That -4.19005% +/- 1.15188% (n=30) regression on Ivybridge is now a
>> performance improvement! Thanks Chris for the help!
>>
>>  src/mesa/drivers/dri/i965/intel_batchbuffer.c |  8 ++---
>>  src/mesa/drivers/dri/i965/intel_batchbuffer.h | 52 +++++++++++++++++++--------
>>  2 files changed, 41 insertions(+), 19 deletions(-)
>>
>> diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
>> index f82958f..93f2872 100644
>> --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
>> +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
>> @@ -395,13 +395,13 @@ _intel_batchbuffer_flush(struct brw_context *brw,
>>   */
>>  uint32_t
>>  intel_batchbuffer_reloc(struct brw_context *brw,
>> -                        drm_intel_bo *buffer,
>> +                        drm_intel_bo *buffer, uint32_t offset,
>>                          uint32_t read_domains, uint32_t write_domain,
>>                          uint32_t delta)
>>  {
>>     int ret;
>>
>> -   ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
>> +   ret = drm_intel_bo_emit_reloc(brw->batch.bo, offset,
>>                                buffer, delta,
>>                                read_domains, write_domain);
>>     assert(ret == 0);
>> @@ -416,11 +416,11 @@ intel_batchbuffer_reloc(struct brw_context *brw,
>>
>>  uint64_t
>>  intel_batchbuffer_reloc64(struct brw_context *brw,
>> -                          drm_intel_bo *buffer,
>> +                          drm_intel_bo *buffer, uint32_t offset,
>>                            uint32_t read_domains, uint32_t write_domain,
>>                            uint32_t delta)
>>  {
>> -   int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
>> +   int ret = drm_intel_bo_emit_reloc(brw->batch.bo, offset,
>>                                       buffer, delta,
>>                                       read_domains, write_domain);
>>     assert(ret == 0);
>> diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
>> index c0456f3..6342c97 100644
>> --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
>> +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
>> @@ -59,14 +59,16 @@ void intel_batchbuffer_data(struct brw_context *brw,
>>
>>  uint32_t intel_batchbuffer_reloc(struct brw_context *brw,
>>                                   drm_intel_bo *buffer,
>> +                                 uint32_t offset,
>>                                   uint32_t read_domains,
>>                                   uint32_t write_domain,
>> -                                 uint32_t offset);
>> +                                 uint32_t delta);
>>  uint64_t intel_batchbuffer_reloc64(struct brw_context *brw,
>>                                     drm_intel_bo *buffer,
>> +                                   uint32_t offset,
>>                                     uint32_t read_domains,
>>                                     uint32_t write_domain,
>> -                                   uint32_t offset);
>> +                                   uint32_t delta);
>>  static inline uint32_t float_as_int(float f)
>>  {
>>     union {
>> @@ -160,23 +162,43 @@ intel_batchbuffer_advance(struct brw_context *brw)
>>  #endif
>>  }
>>
>> -#define BEGIN_BATCH(n) intel_batchbuffer_begin(brw, n, RENDER_RING)
>> -#define BEGIN_BATCH_BLT(n) intel_batchbuffer_begin(brw, n, BLT_RING)
>> -#define OUT_BATCH(d) intel_batchbuffer_emit_dword(brw, d)
>> -#define OUT_BATCH_F(f) intel_batchbuffer_emit_float(brw, f)
>> -#define OUT_RELOC(buf, read_domains, write_domain, delta)                  \
>> -   OUT_BATCH(intel_batchbuffer_reloc(brw, buf, read_domains, write_domain, \
>> -                                     delta))
>> +#define BEGIN_BATCH(n) do {                            \
>> +   intel_batchbuffer_begin(brw, (n), RENDER_RING);     \
>> +   uint32_t *__map = &brw->batch.map[brw->batch.used]; \
>> +   int __idx = 0, UNUSED __final_idx = (n)
>> +
>> +#define BEGIN_BATCH_BLT(n) do {                        \
>> +   intel_batchbuffer_begin(brw, (n), BLT_RING);        \
>> +   uint32_t *__map = &brw->batch.map[brw->batch.used]; \
>> +   int __idx = 0, UNUSED __final_idx = (n)
>
> From your inspiration, I used
> uint32_t *__map = brw->batch.ptr;
> brw->batch.ptr += count;
>
> And then emit_dword: *__map++ = dw;
>
> This allows us to use __map as a parameter to those functions
> (set_blitter_tiling, emit_vertex_state) i.e instead of having to convert
> them to a macro, you can just:
>
> diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
> index 0606a77..41155a9 100644
> --- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
> +++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
> @@ -602,8 +602,9 @@ brw_prepare_shader_draw_parameters(struct brw_context *brw)
>  /**
>   * Emit a VERTEX_BUFFER_STATE entry (part of 3DSTATE_VERTEX_BUFFERS).
>   */
> -static void
> +static uint32_t *
>  emit_vertex_buffer_state(struct brw_context *brw,
> +                         uint32_t *out,
>                           unsigned buffer_nr,
>                           brw_bo *bo,
>                           unsigned bo_ending_address,
> @@ -641,6 +642,8 @@ emit_vertex_buffer_state(struct brw_context *brw,
>        OUT_BATCH(0);
>     }
>     OUT_BATCH(step_rate);
> +
> +   return out;
>  }
>
> For the idx version, you can just pass idx in/out ofc.
>
> Then you also don't have to track __final_idx for non-debug builds as the
> check is just assert(__map == brw->batch.map);
>
> The only catch is changing OUT_RELOC to use __map - brw->batch.base, but
> you have to change OUT_RELOC for idx as well.
>
> Overall, given the locals gcc is able to generate efficient code for
> either *ptr++ or ptr[idx++], so it is a question of what will be easier
> to use in the future. For that I think you would like functions
> whereever possible.

Thanks. I've sent a v3 which goes back to using a direct pointer to
the next location which allows us to avoid making things macros.