[Mesa-dev] [PATCH 3/3] i965: Reimplement ARB_transform_feedback2 on Haswell and later.

Sat May 7 08:12:29 UTC 2016

On 2016-05-06 23:42:49, Kenneth Graunke wrote:
> My old implementation accumulated <start, end> pairs in a buffer,
> and eventually processed that data on the CPU.  This meant flushing
> the batchbuffer and waiting for it to completely execute before we
> could map it, resulting in really long stalls.  We could also run out
> of space in the buffer, and have to do this early.
> 
> Instead, we can use Haswell's MI_MATH command to do the (end - start)
> subtraction, as well as the multiplication by 2 or 3 to convert from
> the number of primitives written to the number of vertices written.
> We still need to CS stall to read the counters, but otherwise everything
> is completely pipelined - there's no CPU<->GPU synchronization required.
> It also uses only 80 bytes in the buffer, no matter what.
> 
> Improves performance in Manhattan on Skylake GT3e at 800x600 by
> 6.1086% +/- 0.954166% (n=9).  At 1920x1080, improves performance
> by 2.82103% +/- 0.148596% (n=84).
> 
> Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
> ---
>  src/mesa/drivers/dri/i965/Makefile.sources |   1 +
>  src/mesa/drivers/dri/i965/brw_context.c    |  14 +-
>  src/mesa/drivers/dri/i965/brw_context.h    |  14 ++
>  src/mesa/drivers/dri/i965/brw_draw.c       |  38 ++++-
>  src/mesa/drivers/dri/i965/hsw_sol.c        | 253 +++++++++++++++++++++++++++++
>  5 files changed, 308 insertions(+), 12 deletions(-)
>  create mode 100644 src/mesa/drivers/dri/i965/hsw_sol.c
> 
> diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
> index 8c60954..d35775e 100644
> --- a/src/mesa/drivers/dri/i965/Makefile.sources
> +++ b/src/mesa/drivers/dri/i965/Makefile.sources
> @@ -228,6 +228,7 @@ i965_FILES = \
>         gen8_vs_state.c \
>         gen8_wm_depth_stencil.c \
>         hsw_queryobj.c \
> +       hsw_sol.c \
>         intel_batchbuffer.c \
>         intel_batchbuffer.h \
>         intel_blit.c \
> diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
> index 1380d41..2338514 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.c
> +++ b/src/mesa/drivers/dri/i965/brw_context.c
> @@ -372,13 +372,18 @@ brw_init_driver_functions(struct brw_context *brw,
>  
>     functions->NewTransformFeedback = brw_new_transform_feedback;
>     functions->DeleteTransformFeedback = brw_delete_transform_feedback;
> -   functions->GetTransformFeedbackVertexCount =
> -      brw_get_transform_feedback_vertex_count;
> -   if (brw->gen >= 7) {
> +   if (brw->intelScreen->has_mi_math) {
> +      functions->BeginTransformFeedback = hsw_begin_transform_feedback;
> +      functions->EndTransformFeedback = hsw_end_transform_feedback;
> +      functions->PauseTransformFeedback = hsw_pause_transform_feedback;
> +      functions->ResumeTransformFeedback = hsw_resume_transform_feedback;
> +   } else if (brw->gen >= 7) {
>        functions->BeginTransformFeedback = gen7_begin_transform_feedback;
>        functions->EndTransformFeedback = gen7_end_transform_feedback;
>        functions->PauseTransformFeedback = gen7_pause_transform_feedback;
>        functions->ResumeTransformFeedback = gen7_resume_transform_feedback;
> +      functions->GetTransformFeedbackVertexCount =
> +         brw_get_transform_feedback_vertex_count;
>     } else {
>        functions->BeginTransformFeedback = brw_begin_transform_feedback;
>        functions->EndTransformFeedback = brw_end_transform_feedback;
> @@ -494,7 +499,8 @@ brw_initialize_context_constants(struct brw_context *brw)
>     ctx->Const.MaxTransformFeedbackSeparateComponents =
>        BRW_MAX_SOL_BINDINGS / BRW_MAX_SOL_BUFFERS;
>  
> -   ctx->Const.AlwaysUseGetTransformFeedbackVertexCount = true;
> +   ctx->Const.AlwaysUseGetTransformFeedbackVertexCount =
> +      !brw->intelScreen->has_mi_math;
>  
>     int max_samples;
>     const int *msaa_modes = intel_supported_msaa_modes(brw->intelScreen);
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
> index b620f14..035cbe9 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -1656,6 +1656,20 @@ void
>  gen7_resume_transform_feedback(struct gl_context *ctx,
>                                 struct gl_transform_feedback_object *obj);
>  
> +/* hsw_sol.c */
> +void
> +hsw_begin_transform_feedback(struct gl_context *ctx, GLenum mode,
> +                             struct gl_transform_feedback_object *obj);
> +void
> +hsw_end_transform_feedback(struct gl_context *ctx,
> +                           struct gl_transform_feedback_object *obj);
> +void
> +hsw_pause_transform_feedback(struct gl_context *ctx,
> +                             struct gl_transform_feedback_object *obj);
> +void
> +hsw_resume_transform_feedback(struct gl_context *ctx,
> +                              struct gl_transform_feedback_object *obj);
> +
>  /* brw_blorp_blit.cpp */
>  GLbitfield
>  brw_blorp_framebuffer(struct brw_context *brw,
> diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
> index afa8a4e..9d034cf 100644
> --- a/src/mesa/drivers/dri/i965/brw_draw.c
> +++ b/src/mesa/drivers/dri/i965/brw_draw.c
> @@ -153,7 +153,9 @@ trim(GLenum prim, GLuint length)
>  static void
>  brw_emit_prim(struct brw_context *brw,
>                const struct _mesa_prim *prim,
> -              uint32_t hw_prim)
> +              uint32_t hw_prim,
> +              struct brw_transform_feedback_object *xfb_obj,
> +              unsigned stream)
>  {
>     int verts_per_instance;
>     int vertex_access_type;
> @@ -185,7 +187,7 @@ brw_emit_prim(struct brw_context *brw,
>        verts_per_instance = prim->count;
>  
>     /* If nothing to emit, just return. */
> -   if (verts_per_instance == 0 && !prim->is_indirect)
> +   if (verts_per_instance == 0 && !prim->is_indirect && !xfb_obj)
>        return;
>  
>     /* If we're set to always flush, do it before and after the primitive emit.
> @@ -197,7 +199,25 @@ brw_emit_prim(struct brw_context *brw,
>        brw_emit_mi_flush(brw);
>  
>     /* If indirect, emit a bunch of loads from the indirect BO. */
> -   if (prim->is_indirect) {
> +   if (xfb_obj) {
> +      indirect_flag = GEN7_3DPRIM_INDIRECT_PARAMETER_ENABLE;
> +
> +      brw_load_register_mem(brw, GEN7_3DPRIM_VERTEX_COUNT,
> +                            xfb_obj->prim_count_bo,
> +                            I915_GEM_DOMAIN_VERTEX, 0,
> +                            stream * sizeof(uint32_t));
> +      BEGIN_BATCH(9);
> +      OUT_BATCH(MI_LOAD_REGISTER_IMM | (9 - 2));
> +      OUT_BATCH(GEN7_3DPRIM_INSTANCE_COUNT);
> +      OUT_BATCH(prim->num_instances);
> +      OUT_BATCH(GEN7_3DPRIM_START_VERTEX);
> +      OUT_BATCH(0);
> +      OUT_BATCH(GEN7_3DPRIM_BASE_VERTEX);
> +      OUT_BATCH(0);
> +      OUT_BATCH(GEN7_3DPRIM_START_INSTANCE);
> +      OUT_BATCH(0);
> +      ADVANCE_BATCH();
> +   } else if (prim->is_indirect) {
>        struct gl_buffer_object *indirect_buffer = brw->ctx.DrawIndirectBuffer;
>        drm_intel_bo *bo = intel_bufferobj_buffer(brw,
>              intel_buffer_object(indirect_buffer),
> @@ -382,6 +402,8 @@ brw_try_draw_prims(struct gl_context *ctx,
>                     const struct _mesa_index_buffer *ib,
>                     GLuint min_index,
>                     GLuint max_index,
> +                   struct brw_transform_feedback_object *xfb_obj,
> +                   unsigned stream,
>                     struct gl_buffer_object *indirect)
>  {
>     struct brw_context *brw = brw_context(ctx);
> @@ -531,7 +553,7 @@ retry:
>          brw_upload_render_state(brw);
>        }
>  
> -      brw_emit_prim(brw, &prims[i], brw->primitive);
> +      brw_emit_prim(brw, &prims[i], brw->primitive, xfb_obj, stream);
>  
>        brw->no_batch_wrap = false;
>  
> @@ -573,14 +595,14 @@ brw_draw_prims(struct gl_context *ctx,
>                 GLboolean index_bounds_valid,
>                 GLuint min_index,
>                 GLuint max_index,
> -               struct gl_transform_feedback_object *unused_tfb_object,
> +               struct gl_transform_feedback_object *gl_xfb_obj,
>                 unsigned stream,
>                 struct gl_buffer_object *indirect)
>  {
>     struct brw_context *brw = brw_context(ctx);
>     const struct gl_client_array **arrays = ctx->Array._DrawArrays;
> -
> -   assert(unused_tfb_object == NULL);
> +   struct brw_transform_feedback_object *xfb_obj =
> +      (struct brw_transform_feedback_object *) gl_xfb_obj;
>  
>     if (!brw_check_conditional_render(brw))
>        return;
> @@ -619,7 +641,7 @@ brw_draw_prims(struct gl_context *ctx,
>      * to it.
>      */
>     brw_try_draw_prims(ctx, arrays, prims, nr_prims, ib, min_index, max_index,
> -                      indirect);
> +                      xfb_obj, stream, indirect);
>  }
>  
>  void
> diff --git a/src/mesa/drivers/dri/i965/hsw_sol.c b/src/mesa/drivers/dri/i965/hsw_sol.c
> new file mode 100644
> index 0000000..657f863
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i965/hsw_sol.c
> @@ -0,0 +1,253 @@
> +/*
> + * Copyright © 2016 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + */
> +
> +/**
> + * An implementation of the transform feedback driver hooks for Haswell
> + * and later hardware.  This uses MI_MATH to compute the number of vertices
> + * written (for use by DrawTransformFeedback()) without any CPU<->GPU
> + * synchronization which could stall.
> + */
> +
> +#include "brw_context.h"
> +#include "brw_state.h"
> +#include "brw_defines.h"
> +#include "intel_batchbuffer.h"
> +#include "intel_buffer_objects.h"
> +#include "main/transformfeedback.h"
> +
> +/**
> + * We store several values in obj->prim_count_bo:
> + *
> + * [4x 32-bit values]: Final Number of Vertices Written
> + * [4x 32-bit values]: Tally of Primitives Written So Far
> + * [4x 64-bit values]: Starting SO_NUM_PRIMS_WRITTEN Counter Snapshots
> + *
> + * The first set of values is used by DrawTransformFeedback(), which
> + * copies one of them into the 3DPRIM_VERTEX_COUNT register and performs
> + * an indirect draw.  The other values are just temporary storage.
> + */
> +
> +#define TALLY_OFFSET (BRW_MAX_XFB_STREAMS * sizeof(uint32_t))
> +#define START_OFFSET (TALLY_OFFSET * 2)
> +
> +/**
> + * Store the SO_NUM_PRIMS_WRITTEN counters for each stream (4 uint64_t values)
> + * to prim_count_bo.
> + */
> +static void
> +save_prim_start_values(struct brw_context *brw,
> +                       struct brw_transform_feedback_object *obj)
> +{
> +   /* Flush any drawing so that the counters have the right values. */
> +   brw_emit_mi_flush(brw);
> +
> +   /* Emit MI_STORE_REGISTER_MEM commands to write the values. */
> +   for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) {
> +      brw_store_register_mem64(brw, obj->prim_count_bo,
> +                               GEN7_SO_NUM_PRIMS_WRITTEN(i),
> +                               START_OFFSET + i * sizeof(uint64_t));
> +   }
> +}
> +
> +/**
> + * Compute the number of primitives written during our most recent
> + * transform feedback activity (the current SO_NUM_PRIMS_WRITTEN value
> + * minus the stashed "start" value), and add it to our running tally.
> + *
> + * If \p finalize is true, also compute the number of vertices written
> + * (by multiplying by the number of vertices per primitive), and store
> + * that to the "final" location.
> + *
> + * Otherwise, just overwrite the old tally with the new one.
> + */
> +static void
> +tally_prims_written(struct brw_context *brw,
> +                    struct brw_transform_feedback_object *obj,
> +                    bool finalize)
> +{
> +   /* Flush any drawing so that the counters have the right values. */
> +   brw_emit_mi_flush(brw);
> +
> +   for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) {
> +      /* GPR0 = Tally */
> +      brw_load_register_imm32(brw, HSW_CS_GPR(0) + 4, 0);
> +      brw_load_register_mem(brw, HSW_CS_GPR(0), obj->prim_count_bo,
> +                            I915_GEM_DOMAIN_INSTRUCTION,
> +                            I915_GEM_DOMAIN_INSTRUCTION,
> +                            TALLY_OFFSET + i * sizeof(uint32_t));
> +      /* GPR1 = Start Snapshot */
> +      brw_load_register_mem64(brw, HSW_CS_GPR(1), obj->prim_count_bo,
> +                              I915_GEM_DOMAIN_INSTRUCTION,
> +                              I915_GEM_DOMAIN_INSTRUCTION,
> +                              START_OFFSET + i * sizeof(uint64_t));
> +      /* GPR2 = Ending Snapshot */
> +      brw_load_register_reg64(brw, GEN7_SO_NUM_PRIMS_WRITTEN(i), HSW_CS_GPR(2));
> +
> +      BEGIN_BATCH(9);
> +      OUT_BATCH(HSW_MI_MATH | (9 - 2));
> +      /* GPR1 = GPR2 (End) - GPR1 (Start) */
> +      OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R2));
> +      OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R1));
> +      OUT_BATCH(MI_MATH_ALU0(SUB));
> +      OUT_BATCH(MI_MATH_ALU2(STORE, R1, ACCU));
> +      /* GPR0 = GPR0 (Tally) + GPR1 (Diff) */
> +      OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R0));
> +      OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R1));
> +      OUT_BATCH(MI_MATH_ALU0(ADD));
> +      OUT_BATCH(MI_MATH_ALU2(STORE, R0, ACCU));
> +      ADVANCE_BATCH();
> +
> +      if (!finalize) {
> +         /* Write back the new tally */
> +         brw_store_register_mem32(brw, obj->prim_count_bo, HSW_CS_GPR(0),
> +                                  TALLY_OFFSET + i * sizeof(uint32_t));
> +      } else {
> +         /* Convert the number of primitives to the number of vertices. */
> +         if (obj->primitive_mode != GL_POINTS) {
> +            const int verts = obj->primitive_mode == GL_TRIANGLES ? 3 : 2;
> +            BEGIN_BATCH(1 + 4 * (verts - 1));
> +            OUT_BATCH(HSW_MI_MATH | (1 + 4 * (verts - 1) - 2));
> +            for (int j = 0; j < verts - 1; j++) {
> +               OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R0));
> +               OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R0));
> +               OUT_BATCH(MI_MATH_ALU0(ADD));
> +               OUT_BATCH(MI_MATH_ALU2(STORE, R0, ACCU));

This bit of math doubles R0 on every loop iteration. So, for triangles
(two iterations), wouldn't it actually multiply by 4, when I think you
want 3?

Maybe you could stash the ACCU to R1 above as well (so that R1 keeps the
un-multiplied tally), and then change the sequence here to add R1 on each
iteration instead of doubling R0:

               OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R0));
               OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R1));
               OUT_BATCH(MI_MATH_ALU0(ADD));
               OUT_BATCH(MI_MATH_ALU2(STORE, R0, ACCU));

Other than this, for the series:
Reviewed-by: Jordan Justen <jordan.l.justen at intel.com>

Oh, but you'll still need to figure out when we can check cmd_parser
version 7 in Mesa.

If you want to land the rest of this earlier, then just change patch 1
to only enable things on gen8+. (Keep my r-b.) Then we can
re-enable for HSW once we can rely on the kernel.

Thanks for finding and fixing the load reg reg issue!

-Jordan

> +            }
> +            ADVANCE_BATCH();
> +         }
> +         /* Store it to the final result */
> +         brw_store_register_mem32(brw, obj->prim_count_bo, HSW_CS_GPR(0),
> +                                  i * sizeof(uint32_t));
> +      }
> +
> +   }
> +}
> +
> +/**
> + * BeginTransformFeedback() driver hook.
> + */
> +void
> +hsw_begin_transform_feedback(struct gl_context *ctx, GLenum mode,
> +                              struct gl_transform_feedback_object *obj)
> +{
> +   struct brw_context *brw = brw_context(ctx);
> +   struct brw_transform_feedback_object *brw_obj =
> +      (struct brw_transform_feedback_object *) obj;
> +
> +   brw_obj->primitive_mode = mode;
> +
> +   /* Reset the SO buffer offsets to 0. */
> +   if (brw->gen >= 8) {
> +      brw_obj->zero_offsets = true;
> +   } else {
> +      BEGIN_BATCH(1 + 2 * BRW_MAX_XFB_STREAMS);
> +      OUT_BATCH(MI_LOAD_REGISTER_IMM | (1 + 2 * BRW_MAX_XFB_STREAMS - 2));
> +      for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) {
> +         OUT_BATCH(GEN7_SO_WRITE_OFFSET(i));
> +         OUT_BATCH(0);
> +      }
> +      ADVANCE_BATCH();
> +   }
> +
> +   /* Zero out the initial tallies */
> +   brw_store_data_imm64(brw, brw_obj->prim_count_bo, TALLY_OFFSET,     0ull);
> +   brw_store_data_imm64(brw, brw_obj->prim_count_bo, TALLY_OFFSET + 8, 0ull);
> +
> +   /* Store the new starting value of the SO_NUM_PRIMS_WRITTEN counters. */
> +   save_prim_start_values(brw, brw_obj);
> +}
> +
> +/**
> + * PauseTransformFeedback() driver hook.
> + */
> +void
> +hsw_pause_transform_feedback(struct gl_context *ctx,
> +                              struct gl_transform_feedback_object *obj)
> +{
> +   struct brw_context *brw = brw_context(ctx);
> +   struct brw_transform_feedback_object *brw_obj =
> +      (struct brw_transform_feedback_object *) obj;
> +
> +   if (brw->is_haswell) {
> +      /* Save the SOL buffer offset register values. */
> +      for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) {
> +         BEGIN_BATCH(3);
> +         OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
> +         OUT_BATCH(GEN7_SO_WRITE_OFFSET(i));
> +         OUT_RELOC(brw_obj->offset_bo,
> +                   I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
> +                   i * sizeof(uint32_t));
> +         ADVANCE_BATCH();
> +      }
> +   }
> +
> +   /* Add any primitives written to our tally */
> +   tally_prims_written(brw, brw_obj, false);
> +}
> +
> +/**
> + * ResumeTransformFeedback() driver hook.
> + */
> +void
> +hsw_resume_transform_feedback(struct gl_context *ctx,
> +                               struct gl_transform_feedback_object *obj)
> +{
> +   struct brw_context *brw = brw_context(ctx);
> +   struct brw_transform_feedback_object *brw_obj =
> +      (struct brw_transform_feedback_object *) obj;
> +
> +   if (brw->is_haswell) {
> +      /* Reload the SOL buffer offset registers. */
> +      for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) {
> +         BEGIN_BATCH(3);
> +         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
> +         OUT_BATCH(GEN7_SO_WRITE_OFFSET(i));
> +         OUT_RELOC(brw_obj->offset_bo,
> +                   I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
> +                   i * sizeof(uint32_t));
> +         ADVANCE_BATCH();
> +      }
> +   }
> +
> +   /* Store the new starting value of the SO_NUM_PRIMS_WRITTEN counters. */
> +   save_prim_start_values(brw, brw_obj);
> +}
> +
> +/**
> + * EndTransformFeedback() driver hook.
> + */
> +void
> +hsw_end_transform_feedback(struct gl_context *ctx,
> +                           struct gl_transform_feedback_object *obj)
> +{
> +   struct brw_context *brw = brw_context(ctx);
> +   struct brw_transform_feedback_object *brw_obj =
> +      (struct brw_transform_feedback_object *) obj;
> +
> +   /* Add any primitives written to our tally, convert it from the number
> +    * of primitives written to the number of vertices written, and store
> +    * it in the "final" location in the buffer which DrawTransformFeedback()
> +    * will use as the vertex count.
> +    */
> +   tally_prims_written(brw, brw_obj, true);
> +}
> -- 
> 2.8.2
> 
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev