[Mesa-dev] [PATCH 10/10] i965: Implement transform feedback query support in hardware on Gen6+.
Paul Berry
stereotype441 at gmail.com
Mon May 20 10:57:59 PDT 2013
On 17 May 2013 10:18, Kenneth Graunke <kenneth at whitecape.org> wrote:
> Now that we have hardware contexts and can use MI_STORE_REGISTER_MEM,
> we can use the GPU's pipeline statistics counters rather than going out
> of our way to count primitives in software.
>
> Aside from being simpler, this also paves the way for Geometry Shaders,
> which can output an arbitrary number of primitives on the GPU.
>
While you're giving rationales, another justification is that this will let
us use hardware primitive restart support when transform feedback and/or
primitive counting is in use.
>
> The GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN query is easy: it
> corresponds to the SO_NUM_PRIMS_WRITTEN/SO_NUM_PRIMS_WRITTEN0_IVB
> counters.
>
> The GL_PRIMITIVES_GENERATED query is trickier. Gen provides several
> statistics registers which /almost/ match the semantics required:
> - IA_PRIMITIVES_COUNT
> The number of primitives fetched by the VF or IA (input assembler).
> This undercounts when GS is enabled, as it can output many primitives.
> - GS_PRIMITIVES_COUNT
> The number of primitives output by the GS. Unfortunately, this
> doesn't increment unless the GS unit is actually enabled, and it
> usually isn't.
> - SO_PRIM_STORAGE_NEEDED*_IVB
> The amount of space needed to write primitives output by transform
> feedback. These naturally only work when transform feedback is on.
> We'd also have to add the counters for all four streams.
> - CL_INVOCATION_COUNT
> The number of primitives processed by the clipper. This doesn't work
> if the GS or SOL throw away primitives for rasterizer discard.
> However, it does increment even if the clipper is in REJECT_ALL mode.
>
> Dynamically switching between counters would be painfully complicated,
> especially since GS, rasterizer discard, and transform feedback can all
> be switched on and off repeatedly during a single query.
>
> The most usable counter is CL_INVOCATION_COUNT. The previous two
> patches reworked rasterizer discard support so that all primitives hit
> the clipper, making this work.
>
> Cc: Eric Anholt <eric at anholt.net>
> Cc: Paul Berry <stereotype441 at gmail.com>
> Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
> ---
> src/mesa/drivers/dri/i965/gen6_queryobj.c | 105 +++++++++++++++++++-----------
> 1 file changed, 66 insertions(+), 39 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/gen6_queryobj.c b/src/mesa/drivers/dri/i965/gen6_queryobj.c
> index 28af8d7..a032227 100644
> --- a/src/mesa/drivers/dri/i965/gen6_queryobj.c
> +++ b/src/mesa/drivers/dri/i965/gen6_queryobj.c
> @@ -94,6 +94,57 @@ write_depth_count(struct intel_context *intel, drm_intel_bo *query_bo, int idx)
> ADVANCE_BATCH();
> }
>
> +/*
> + * Write an arbitrary 64-bit register to a buffer via MI_STORE_REGISTER_MEM.
> + *
> + * Only TIMESTAMP and PS_DEPTH_COUNT have special PIPE_CONTROL support; other
> + * counters have to be read via the generic MI_STORE_REGISTER_MEM. This
> + * function also performs a pipeline flush for proper synchronization.
> + */
> +static void
> +write_reg(struct intel_context *intel,
> + drm_intel_bo *query_bo, uint32_t reg, int idx)
> +{
> + assert(intel->gen >= 6);
> +
> + intel_batchbuffer_emit_mi_flush(intel);
> +
> + /* MI_STORE_REGISTER_MEM only stores a single 32-bit value, so to
> + * read a full 64-bit register, we need to do two of them.
> + */
> + BEGIN_BATCH(3);
> + OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
> + OUT_BATCH(reg);
> + OUT_RELOC(query_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
> + idx * sizeof(uint64_t));
> + ADVANCE_BATCH();
> +
> + BEGIN_BATCH(3);
> + OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
> + OUT_BATCH(reg + sizeof(uint32_t));
> + OUT_RELOC(query_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
> + sizeof(uint32_t) + idx * sizeof(uint64_t));
> + ADVANCE_BATCH();
> +}
> +
> +static void
> +write_primitives_generated(struct intel_context *intel,
> + drm_intel_bo *query_bo, int idx)
> +{
> + write_reg(intel, query_bo, CL_INVOCATION_COUNT, idx);
> +}
> +
> +static void
> +write_xfb_primitives_written(struct intel_context *intel,
> + drm_intel_bo *query_bo, int idx)
> +{
> + if (intel->gen >= 7) {
> + write_reg(intel, query_bo, SO_NUM_PRIMS_WRITTEN0_IVB, idx);
> + } else {
> + write_reg(intel, query_bo, SO_NUM_PRIMS_WRITTEN, idx);
> + }
> +}
> +
> /**
> * Wait on the query object's BO and calculate the final result.
> */
> @@ -152,21 +203,20 @@ gen6_queryobj_get_results(struct gl_context *ctx,
> query->Base.Result &= (1ull << 36) - 1;
> break;
>
> - case GL_SAMPLES_PASSED_ARB:
> - query->Base.Result += results[1] - results[0];
> - break;
> -
> case GL_ANY_SAMPLES_PASSED:
> case GL_ANY_SAMPLES_PASSED_CONSERVATIVE:
> - query->Base.Result = results[0] != results[1];
> + if (results[0] != results[1])
> + query->Base.Result = true;
>
This change to the GL_ANY_SAMPLES_PASSED case looks like it got squashed
into the wrong patch of the series. With that fixed, this patch is:
Reviewed-by: Paul Berry <stereotype441 at gmail.com>
> break;
>
> + case GL_SAMPLES_PASSED_ARB:
> case GL_PRIMITIVES_GENERATED:
> case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN:
> - /* We don't actually query the hardware for this value, so query->bo
> - * should always be NULL and execution should never reach here.
> + /* We need to use += rather than = here since some BLT-based operations
> + * may have added additional samples to our occlusion query value.
> + * It shouldn't matter for geometry queries, but is harmless.
> */
> - assert(!"Unreachable");
> + query->Base.Result += results[1] - results[0];
> break;
>
> default:
> @@ -191,10 +241,13 @@ gen6_queryobj_get_results(struct gl_context *ctx,
> static void
> gen6_begin_query(struct gl_context *ctx, struct gl_query_object *q)
> {
> - struct brw_context *brw = brw_context(ctx);
> struct intel_context *intel = intel_context(ctx);
> struct brw_query_object *query = (struct brw_query_object *)q;
>
> + /* Since we're starting a new query, we need to throw away old results. */
> + drm_intel_bo_unreference(query->bo);
> + query->bo = drm_intel_bo_alloc(intel->bufmgr, "query results", 4096, 4096);
> +
> switch (query->Base.Target) {
> case GL_TIME_ELAPSED:
> /* For timestamp queries, we record the starting time right away so that
> @@ -216,36 +269,21 @@ gen6_begin_query(struct gl_context *ctx, struct gl_query_object *q)
> * obtain the time elapsed. Notably, this includes time elapsed while
> * the system was doing other work, such as running other applications.
> */
> - drm_intel_bo_unreference(query->bo);
> - query->bo = drm_intel_bo_alloc(intel->bufmgr, "timer query", 4096, 4096);
> write_timestamp(intel, query->bo, 0);
> break;
>
> case GL_ANY_SAMPLES_PASSED:
> case GL_ANY_SAMPLES_PASSED_CONSERVATIVE:
> case GL_SAMPLES_PASSED_ARB:
> - /* Since we're starting a new query, we need to be sure to throw away
> - * any previous occlusion query results.
> - */
> - drm_intel_bo_unreference(query->bo);
> - query->bo = drm_intel_bo_alloc(intel->bufmgr, "occl. query", 4096, 4096);
> write_depth_count(intel, query->bo, 0);
> break;
>
> case GL_PRIMITIVES_GENERATED:
> - /* We don't actually query the hardware for this value; we keep track of
> - * it a software counter. So just reset the counter.
> - */
> - brw->sol.primitives_generated = 0;
> - brw->sol.counting_primitives_generated = true;
> + write_primitives_generated(intel, query->bo, 0);
> break;
>
> case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN:
> - /* We don't actually query the hardware for this value; we keep track of
> - * it a software counter. So just reset the counter.
> - */
> - brw->sol.primitives_written = 0;
> - brw->sol.counting_primitives_written = true;
> + write_xfb_primitives_written(intel, query->bo, 0);
> break;
>
> default:
> @@ -265,7 +303,6 @@ gen6_begin_query(struct gl_context *ctx, struct gl_query_object *q)
> static void
> gen6_end_query(struct gl_context *ctx, struct gl_query_object *q)
> {
> - struct brw_context *brw = brw_context(ctx);
> struct intel_context *intel = intel_context(ctx);
> struct brw_query_object *query = (struct brw_query_object *)q;
>
> @@ -281,21 +318,11 @@ gen6_end_query(struct gl_context *ctx, struct gl_query_object *q)
> break;
>
> case GL_PRIMITIVES_GENERATED:
> - /* We don't actually query the hardware for this value; we keep track of
> - * it in a software counter. So just read the counter and store it in
> - * the query object.
> - query->Base.Result = brw->sol.primitives_generated;
> - brw->sol.counting_primitives_generated = false;
> + write_primitives_generated(intel, query->bo, 1);
> break;
>
> case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN:
> - /* We don't actually query the hardware for this value; we keep track of
> - * it in a software counter. So just read the counter and store it in
> - * the query object.
> - query->Base.Result = brw->sol.primitives_written;
> - brw->sol.counting_primitives_written = false;
> + write_xfb_primitives_written(intel, query->bo, 1);
> break;
>
> default:
> --
> 1.8.2.3
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.freedesktop.org/archives/mesa-dev/attachments/20130520/c53c50e4/attachment-0001.html>
More information about the mesa-dev mailing list