[Mesa-dev] [PATCH 3/3] i965: Reimplement ARB_transform_feedback2 on Haswell and later.
Jordan Justen
jordan.l.justen at intel.com
Sat May 7 08:12:29 UTC 2016
On 2016-05-06 23:42:49, Kenneth Graunke wrote:
> My old implementation accumulated <start, end> pairs in a buffer,
> and eventually processed that data on the CPU. This meant flushing
> the batchbuffer and waiting for it to completely execute before we
> could map it, resulting in really long stalls. We could also run out
> of space in the buffer, and have to do this early.
>
> Instead, we can use Haswell's MI_MATH command to do the (end - start)
> subtraction, as well as the multiplication by 2 or 3 to convert from
> the number of primitives written to the number of vertices written.
> We still need to CS stall to read the counters, but otherwise everything
> is completely pipelined - there's no CPU<->GPU synchronization required.
> It also uses only 80 bytes in the buffer, no matter what.
>
> Improves performance in Manhattan on Skylake GT3e at 800x600 by
> 6.1086% +/- 0.954166% (n=9). At 1920x1080, improves performance
> by 2.82103% +/- 0.148596% (n=84).
>
> Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
> ---
> src/mesa/drivers/dri/i965/Makefile.sources | 1 +
> src/mesa/drivers/dri/i965/brw_context.c | 14 +-
> src/mesa/drivers/dri/i965/brw_context.h | 14 ++
> src/mesa/drivers/dri/i965/brw_draw.c | 38 ++++-
> src/mesa/drivers/dri/i965/hsw_sol.c | 253 +++++++++++++++++++++++++++++
> 5 files changed, 308 insertions(+), 12 deletions(-)
> create mode 100644 src/mesa/drivers/dri/i965/hsw_sol.c
>
> diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
> index 8c60954..d35775e 100644
> --- a/src/mesa/drivers/dri/i965/Makefile.sources
> +++ b/src/mesa/drivers/dri/i965/Makefile.sources
> @@ -228,6 +228,7 @@ i965_FILES = \
> gen8_vs_state.c \
> gen8_wm_depth_stencil.c \
> hsw_queryobj.c \
> + hsw_sol.c \
> intel_batchbuffer.c \
> intel_batchbuffer.h \
> intel_blit.c \
> diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
> index 1380d41..2338514 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.c
> +++ b/src/mesa/drivers/dri/i965/brw_context.c
> @@ -372,13 +372,18 @@ brw_init_driver_functions(struct brw_context *brw,
>
> functions->NewTransformFeedback = brw_new_transform_feedback;
> functions->DeleteTransformFeedback = brw_delete_transform_feedback;
> - functions->GetTransformFeedbackVertexCount =
> - brw_get_transform_feedback_vertex_count;
> - if (brw->gen >= 7) {
> + if (brw->intelScreen->has_mi_math) {
> + functions->BeginTransformFeedback = hsw_begin_transform_feedback;
> + functions->EndTransformFeedback = hsw_end_transform_feedback;
> + functions->PauseTransformFeedback = hsw_pause_transform_feedback;
> + functions->ResumeTransformFeedback = hsw_resume_transform_feedback;
> + } else if (brw->gen >= 7) {
> functions->BeginTransformFeedback = gen7_begin_transform_feedback;
> functions->EndTransformFeedback = gen7_end_transform_feedback;
> functions->PauseTransformFeedback = gen7_pause_transform_feedback;
> functions->ResumeTransformFeedback = gen7_resume_transform_feedback;
> + functions->GetTransformFeedbackVertexCount =
> + brw_get_transform_feedback_vertex_count;
> } else {
> functions->BeginTransformFeedback = brw_begin_transform_feedback;
> functions->EndTransformFeedback = brw_end_transform_feedback;
> @@ -494,7 +499,8 @@ brw_initialize_context_constants(struct brw_context *brw)
> ctx->Const.MaxTransformFeedbackSeparateComponents =
> BRW_MAX_SOL_BINDINGS / BRW_MAX_SOL_BUFFERS;
>
> - ctx->Const.AlwaysUseGetTransformFeedbackVertexCount = true;
> + ctx->Const.AlwaysUseGetTransformFeedbackVertexCount =
> + !brw->intelScreen->has_mi_math;
>
> int max_samples;
> const int *msaa_modes = intel_supported_msaa_modes(brw->intelScreen);
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
> index b620f14..035cbe9 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -1656,6 +1656,20 @@ void
> gen7_resume_transform_feedback(struct gl_context *ctx,
> struct gl_transform_feedback_object *obj);
>
> +/* hsw_sol.c */
> +void
> +hsw_begin_transform_feedback(struct gl_context *ctx, GLenum mode,
> + struct gl_transform_feedback_object *obj);
> +void
> +hsw_end_transform_feedback(struct gl_context *ctx,
> + struct gl_transform_feedback_object *obj);
> +void
> +hsw_pause_transform_feedback(struct gl_context *ctx,
> + struct gl_transform_feedback_object *obj);
> +void
> +hsw_resume_transform_feedback(struct gl_context *ctx,
> + struct gl_transform_feedback_object *obj);
> +
> /* brw_blorp_blit.cpp */
> GLbitfield
> brw_blorp_framebuffer(struct brw_context *brw,
> diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
> index afa8a4e..9d034cf 100644
> --- a/src/mesa/drivers/dri/i965/brw_draw.c
> +++ b/src/mesa/drivers/dri/i965/brw_draw.c
> @@ -153,7 +153,9 @@ trim(GLenum prim, GLuint length)
> static void
> brw_emit_prim(struct brw_context *brw,
> const struct _mesa_prim *prim,
> - uint32_t hw_prim)
> + uint32_t hw_prim,
> + struct brw_transform_feedback_object *xfb_obj,
> + unsigned stream)
> {
> int verts_per_instance;
> int vertex_access_type;
> @@ -185,7 +187,7 @@ brw_emit_prim(struct brw_context *brw,
> verts_per_instance = prim->count;
>
> /* If nothing to emit, just return. */
> - if (verts_per_instance == 0 && !prim->is_indirect)
> + if (verts_per_instance == 0 && !prim->is_indirect && !xfb_obj)
> return;
>
> /* If we're set to always flush, do it before and after the primitive emit.
> @@ -197,7 +199,25 @@ brw_emit_prim(struct brw_context *brw,
> brw_emit_mi_flush(brw);
>
> /* If indirect, emit a bunch of loads from the indirect BO. */
> - if (prim->is_indirect) {
> + if (xfb_obj) {
> + indirect_flag = GEN7_3DPRIM_INDIRECT_PARAMETER_ENABLE;
> +
> + brw_load_register_mem(brw, GEN7_3DPRIM_VERTEX_COUNT,
> + xfb_obj->prim_count_bo,
> + I915_GEM_DOMAIN_VERTEX, 0,
> + stream * sizeof(uint32_t));
> + BEGIN_BATCH(9);
> + OUT_BATCH(MI_LOAD_REGISTER_IMM | (9 - 2));
> + OUT_BATCH(GEN7_3DPRIM_INSTANCE_COUNT);
> + OUT_BATCH(prim->num_instances);
> + OUT_BATCH(GEN7_3DPRIM_START_VERTEX);
> + OUT_BATCH(0);
> + OUT_BATCH(GEN7_3DPRIM_BASE_VERTEX);
> + OUT_BATCH(0);
> + OUT_BATCH(GEN7_3DPRIM_START_INSTANCE);
> + OUT_BATCH(0);
> + ADVANCE_BATCH();
> + } else if (prim->is_indirect) {
> struct gl_buffer_object *indirect_buffer = brw->ctx.DrawIndirectBuffer;
> drm_intel_bo *bo = intel_bufferobj_buffer(brw,
> intel_buffer_object(indirect_buffer),
> @@ -382,6 +402,8 @@ brw_try_draw_prims(struct gl_context *ctx,
> const struct _mesa_index_buffer *ib,
> GLuint min_index,
> GLuint max_index,
> + struct brw_transform_feedback_object *xfb_obj,
> + unsigned stream,
> struct gl_buffer_object *indirect)
> {
> struct brw_context *brw = brw_context(ctx);
> @@ -531,7 +553,7 @@ retry:
> brw_upload_render_state(brw);
> }
>
> - brw_emit_prim(brw, &prims[i], brw->primitive);
> + brw_emit_prim(brw, &prims[i], brw->primitive, xfb_obj, stream);
>
> brw->no_batch_wrap = false;
>
> @@ -573,14 +595,14 @@ brw_draw_prims(struct gl_context *ctx,
> GLboolean index_bounds_valid,
> GLuint min_index,
> GLuint max_index,
> - struct gl_transform_feedback_object *unused_tfb_object,
> + struct gl_transform_feedback_object *gl_xfb_obj,
> unsigned stream,
> struct gl_buffer_object *indirect)
> {
> struct brw_context *brw = brw_context(ctx);
> const struct gl_client_array **arrays = ctx->Array._DrawArrays;
> -
> - assert(unused_tfb_object == NULL);
> + struct brw_transform_feedback_object *xfb_obj =
> + (struct brw_transform_feedback_object *) gl_xfb_obj;
>
> if (!brw_check_conditional_render(brw))
> return;
> @@ -619,7 +641,7 @@ brw_draw_prims(struct gl_context *ctx,
> * to it.
> */
> brw_try_draw_prims(ctx, arrays, prims, nr_prims, ib, min_index, max_index,
> - indirect);
> + xfb_obj, stream, indirect);
> }
>
> void
> diff --git a/src/mesa/drivers/dri/i965/hsw_sol.c b/src/mesa/drivers/dri/i965/hsw_sol.c
> new file mode 100644
> index 0000000..657f863
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i965/hsw_sol.c
> @@ -0,0 +1,253 @@
> +/*
> + * Copyright © 2016 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + */
> +
> +/**
> + * An implementation of the transform feedback driver hooks for Haswell
> + * and later hardware. This uses MI_MATH to compute the number of vertices
> + * written (for use by DrawTransformFeedback()) without any CPU<->GPU
> + * synchronization which could stall.
> + */
> +
> +#include "brw_context.h"
> +#include "brw_state.h"
> +#include "brw_defines.h"
> +#include "intel_batchbuffer.h"
> +#include "intel_buffer_objects.h"
> +#include "main/transformfeedback.h"
> +
> +/**
> + * We store several values in obj->prim_count_bo:
> + *
> + * [4x 32-bit values]: Final Number of Vertices Written
> + * [4x 32-bit values]: Tally of Primitives Written So Far
> + * [4x 64-bit values]: Starting SO_NUM_PRIMS_WRITTEN Counter Snapshots
> + *
> + * The first set of values is used by DrawTransformFeedback(), which
> + * copies one of them into the 3DPRIM_VERTEX_COUNT register and performs
> + * an indirect draw. The other values are just temporary storage.
> + */
> +
> +#define TALLY_OFFSET (BRW_MAX_XFB_STREAMS * sizeof(uint32_t))
> +#define START_OFFSET (TALLY_OFFSET * 2)
> +
> +/**
> + * Store the SO_NUM_PRIMS_WRITTEN counters for each stream (4 uint64_t values)
> + * to prim_count_bo.
> + */
> +static void
> +save_prim_start_values(struct brw_context *brw,
> + struct brw_transform_feedback_object *obj)
> +{
> + /* Flush any drawing so that the counters have the right values. */
> + brw_emit_mi_flush(brw);
> +
> + /* Emit MI_STORE_REGISTER_MEM commands to write the values. */
> + for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) {
> + brw_store_register_mem64(brw, obj->prim_count_bo,
> + GEN7_SO_NUM_PRIMS_WRITTEN(i),
> + START_OFFSET + i * sizeof(uint64_t));
> + }
> +}
> +
> +/**
> + * Compute the number of primitives written during our most recent
> + * transform feedback activity (the current SO_NUM_PRIMS_WRITTEN value
> + * minus the stashed "start" value), and add it to our running tally.
> + *
> + * If \p finalize is true, also compute the number of vertices written
> + * (by multiplying by the number of vertices per primitive), and store
> + * that to the "final" location.
> + *
> + * Otherwise, just overwrite the old tally with the new one.
> + */
> +static void
> +tally_prims_written(struct brw_context *brw,
> + struct brw_transform_feedback_object *obj,
> + bool finalize)
> +{
> + /* Flush any drawing so that the counters have the right values. */
> + brw_emit_mi_flush(brw);
> +
> + for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) {
> + /* GPR0 = Tally */
> + brw_load_register_imm32(brw, HSW_CS_GPR(0) + 4, 0);
> + brw_load_register_mem(brw, HSW_CS_GPR(0), obj->prim_count_bo,
> + I915_GEM_DOMAIN_INSTRUCTION,
> + I915_GEM_DOMAIN_INSTRUCTION,
> + TALLY_OFFSET + i * sizeof(uint32_t));
> + /* GPR1 = Start Snapshot */
> + brw_load_register_mem64(brw, HSW_CS_GPR(1), obj->prim_count_bo,
> + I915_GEM_DOMAIN_INSTRUCTION,
> + I915_GEM_DOMAIN_INSTRUCTION,
> + START_OFFSET + i * sizeof(uint64_t));
> + /* GPR2 = Ending Snapshot */
> + brw_load_register_reg64(brw, GEN7_SO_NUM_PRIMS_WRITTEN(i), HSW_CS_GPR(2));
> +
> + BEGIN_BATCH(9);
> + OUT_BATCH(HSW_MI_MATH | (9 - 2));
> + /* GPR1 = GPR2 (End) - GPR1 (Start) */
> + OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R2));
> + OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R1));
> + OUT_BATCH(MI_MATH_ALU0(SUB));
> + OUT_BATCH(MI_MATH_ALU2(STORE, R1, ACCU));
> + /* GPR0 = GPR0 (Tally) + GPR1 (Diff) */
> + OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R0));
> + OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R1));
> + OUT_BATCH(MI_MATH_ALU0(ADD));
> + OUT_BATCH(MI_MATH_ALU2(STORE, R0, ACCU));
> + ADVANCE_BATCH();
> +
> + if (!finalize) {
> + /* Write back the new tally */
> + brw_store_register_mem32(brw, obj->prim_count_bo, HSW_CS_GPR(0),
> + TALLY_OFFSET + i * sizeof(uint32_t));
> + } else {
> + /* Convert the number of primitives to the number of vertices. */
> + if (obj->primitive_mode != GL_POINTS) {
> + const int verts = obj->primitive_mode == GL_TRIANGLES ? 3 : 2;
> + BEGIN_BATCH(1 + 4 * (verts - 1));
> + OUT_BATCH(HSW_MI_MATH | (1 + 4 * (verts - 1) - 2));
> + for (int j = 0; j < verts - 1; j++) {
> + OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R0));
> + OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R0));
> + OUT_BATCH(MI_MATH_ALU0(ADD));
> + OUT_BATCH(MI_MATH_ALU2(STORE, R0, ACCU));
This bit of math doubles R0. So, for triangles, wouldn't it actually
multiply by 4, but I think you want 3?
Maybe you could stash the ACCU to R1 above as well, and then change
the sequence here to:
OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R0));
OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R1));
OUT_BATCH(MI_MATH_ALU0(ADD));
OUT_BATCH(MI_MATH_ALU2(STORE, R0, ACCU));
Other than this, for the series:
Reviewed-by: Jordan Justen <jordan.l.justen at intel.com>
Oh, but you'll still need to figure out when we can check cmd_parser
version 7 in Mesa.
If you want to land the rest of this earlier then just change patch 1
to old enable things only on gen8+. (Keep my r-b.) Then we can
re-enable for HSW once we can rely on the kernel.
Thanks for finding and fixing the load reg reg issue!
-Jordan
> + }
> + ADVANCE_BATCH();
> + }
> + /* Store it to the final result */
> + brw_store_register_mem32(brw, obj->prim_count_bo, HSW_CS_GPR(0),
> + i * sizeof(uint32_t));
> + }
> +
> + }
> +}
> +
> +/**
> + * BeginTransformFeedback() driver hook.
> + */
> +void
> +hsw_begin_transform_feedback(struct gl_context *ctx, GLenum mode,
> + struct gl_transform_feedback_object *obj)
> +{
> + struct brw_context *brw = brw_context(ctx);
> + struct brw_transform_feedback_object *brw_obj =
> + (struct brw_transform_feedback_object *) obj;
> +
> + brw_obj->primitive_mode = mode;
> +
> + /* Reset the SO buffer offsets to 0. */
> + if (brw->gen >= 8) {
> + brw_obj->zero_offsets = true;
> + } else {
> + BEGIN_BATCH(1 + 2 * BRW_MAX_XFB_STREAMS);
> + OUT_BATCH(MI_LOAD_REGISTER_IMM | (1 + 2 * BRW_MAX_XFB_STREAMS - 2));
> + for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) {
> + OUT_BATCH(GEN7_SO_WRITE_OFFSET(i));
> + OUT_BATCH(0);
> + }
> + ADVANCE_BATCH();
> + }
> +
> + /* Zero out the initial tallies */
> + brw_store_data_imm64(brw, brw_obj->prim_count_bo, TALLY_OFFSET, 0ull);
> + brw_store_data_imm64(brw, brw_obj->prim_count_bo, TALLY_OFFSET + 8, 0ull);
> +
> + /* Store the new starting value of the SO_NUM_PRIMS_WRITTEN counters. */
> + save_prim_start_values(brw, brw_obj);
> +}
> +
> +/**
> + * PauseTransformFeedback() driver hook.
> + */
> +void
> +hsw_pause_transform_feedback(struct gl_context *ctx,
> + struct gl_transform_feedback_object *obj)
> +{
> + struct brw_context *brw = brw_context(ctx);
> + struct brw_transform_feedback_object *brw_obj =
> + (struct brw_transform_feedback_object *) obj;
> +
> + if (brw->is_haswell) {
> + /* Save the SOL buffer offset register values. */
> + for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) {
> + BEGIN_BATCH(3);
> + OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
> + OUT_BATCH(GEN7_SO_WRITE_OFFSET(i));
> + OUT_RELOC(brw_obj->offset_bo,
> + I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
> + i * sizeof(uint32_t));
> + ADVANCE_BATCH();
> + }
> + }
> +
> + /* Add any primitives written to our tally */
> + tally_prims_written(brw, brw_obj, false);
> +}
> +
> +/**
> + * ResumeTransformFeedback() driver hook.
> + */
> +void
> +hsw_resume_transform_feedback(struct gl_context *ctx,
> + struct gl_transform_feedback_object *obj)
> +{
> + struct brw_context *brw = brw_context(ctx);
> + struct brw_transform_feedback_object *brw_obj =
> + (struct brw_transform_feedback_object *) obj;
> +
> + if (brw->is_haswell) {
> + /* Reload the SOL buffer offset registers. */
> + for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) {
> + BEGIN_BATCH(3);
> + OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
> + OUT_BATCH(GEN7_SO_WRITE_OFFSET(i));
> + OUT_RELOC(brw_obj->offset_bo,
> + I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
> + i * sizeof(uint32_t));
> + ADVANCE_BATCH();
> + }
> + }
> +
> + /* Store the new starting value of the SO_NUM_PRIMS_WRITTEN counters. */
> + save_prim_start_values(brw, brw_obj);
> +}
> +
> +/**
> + * EndTransformFeedback() driver hook.
> + */
> +void
> +hsw_end_transform_feedback(struct gl_context *ctx,
> + struct gl_transform_feedback_object *obj)
> +{
> + struct brw_context *brw = brw_context(ctx);
> + struct brw_transform_feedback_object *brw_obj =
> + (struct brw_transform_feedback_object *) obj;
> +
> + /* Add any primitives written to our tally, convert it from the number
> + * of primitives written to the number of vertices written, and store
> + * it in the "final" location in the buffer which DrawTransformFeedback()
> + * will use as the vertex count.
> + */
> + tally_prims_written(brw, brw_obj, true);
> +}
> --
> 2.8.2
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
More information about the mesa-dev
mailing list