[Mesa-dev] [PATCH 3/3] broadcom/vc4: Add support for HW perfmon

Fri Dec 22 23:58:19 UTC 2017

Boris Brezillon <boris.brezillon at free-electrons.com> writes:

> The V3D engine provides several perf counters.
> Implement ->get_driver_query_[group_]info() so that these counters are
> exposed through the GL_AMD_performance_monitor extension.

Thanks for working on this!  I've successfully used it to inform some
work I'm doing on 3DMMES.

> Signed-off-by: Boris Brezillon <boris.brezillon at free-electrons.com>
> ---
>  src/gallium/drivers/vc4/vc4_context.h |  13 +++
>  src/gallium/drivers/vc4/vc4_job.c     |   9 +-
>  src/gallium/drivers/vc4/vc4_query.c   | 197 ++++++++++++++++++++++++++++++++--
>  src/gallium/drivers/vc4/vc4_screen.c  |   7 ++
>  src/gallium/drivers/vc4/vc4_screen.h  |   1 +
>  5 files changed, 215 insertions(+), 12 deletions(-)
>
> diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h
> index 4a1e4093f1a0..b6d9f041efc7 100644
> --- a/src/gallium/drivers/vc4/vc4_context.h
> +++ b/src/gallium/drivers/vc4/vc4_context.h
> @@ -309,6 +309,11 @@ struct vc4_job {
>          struct vc4_job_key key;
>  };
>  
> +struct vc4_hwperfmon {
> +        uint32_t id;
> +        uint64_t counters[DRM_VC4_MAX_PERF_COUNTERS];
> +};
> +
>  struct vc4_context {
>          struct pipe_context base;
>  
> @@ -387,6 +392,8 @@ struct vc4_context {
>          struct pipe_viewport_state viewport;
>          struct vc4_constbuf_stateobj constbuf[PIPE_SHADER_TYPES];
>          struct vc4_vertexbuf_stateobj vertexbuf;
> +
> +        struct vc4_hwperfmon *perfmon;
>          /** @} */
>  };
>  
> @@ -444,6 +451,12 @@ vc4_sampler_state(struct pipe_sampler_state *psampler)
>          return (struct vc4_sampler_state *)psampler;
>  }
>  
> +int vc4_get_driver_query_group_info(struct pipe_screen *pscreen,
> +                                    unsigned index,
> +                                    struct pipe_driver_query_group_info *info);
> +int vc4_get_driver_query_info(struct pipe_screen *pscreen, unsigned index,
> +                              struct pipe_driver_query_info *info);
> +
>  struct pipe_context *vc4_context_create(struct pipe_screen *pscreen,
>                                          void *priv, unsigned flags);
>  void vc4_draw_init(struct pipe_context *pctx);
> diff --git a/src/gallium/drivers/vc4/vc4_job.c b/src/gallium/drivers/vc4/vc4_job.c
> index fb0c5bbc78cf..f75a32565603 100644
> --- a/src/gallium/drivers/vc4/vc4_job.c
> +++ b/src/gallium/drivers/vc4/vc4_job.c
> @@ -362,7 +362,7 @@ vc4_submit_setup_rcl_msaa_surface(struct vc4_job *job,
>          rsc->writes++;
>  }
>  
> -#define MAX_CHUNKS		1
> +#define MAX_CHUNKS		2
>  
>  /**
>   * Submits the job to the kernel and then reinitializes it.
> @@ -467,6 +467,13 @@ vc4_job_submit(struct vc4_context *vc4, struct vc4_job *job)
>          submit.uniforms = (uintptr_t)job->uniforms.base;
>          submit.uniforms_size = cl_offset(&job->uniforms);
>  
> +        if (vc4->perfmon && screen->has_extended_cl) {
> +                chunks[nchunks].perfmon.type = VC4_PERFMON_CHUNK;
> +                chunks[nchunks].perfmon.id = vc4->perfmon->id;
> +                chunks[nchunks].perfmon.pad = 0;
> +                nchunks++;
> +        }
> +
>          if (nchunks) {
>                  submit.flags |= VC4_SUBMIT_CL_EXTENDED;
>                  submit.cl_chunks = (uintptr_t)chunks;
> diff --git a/src/gallium/drivers/vc4/vc4_query.c b/src/gallium/drivers/vc4/vc4_query.c
> index ddf8f8fb0c2c..d6b081bb15d7 100644
> --- a/src/gallium/drivers/vc4/vc4_query.c
> +++ b/src/gallium/drivers/vc4/vc4_query.c
> @@ -32,49 +32,224 @@
>  
>  struct vc4_query
>  {
> -        uint8_t pad;
> +        unsigned num_queries;
> +        struct vc4_hwperfmon *hwperfmon;
>  };
>  
> +static const char *v3d_counter_names[] = {
> +        "FEP-valid-primitives-no-rendered-pixels",
> +        "FEP-valid-primitives-rendered-pixels",
> +        "FEP-clipped-quads",
> +        "FEP-valid-quads",
> +        "TLB-quads-not-passing-stencil-test",
> +        "TLB-quads-not-passing-z-and-stencil-test",

Looks like you missed "TLB-quads-passing-z-and-stencil-test" here.

> +        "TLB-quads-with-zero-coverage",
> +        "TLB-quads-with-non-zero-coverage",
> +        "TLB-quads-written-to-color-buffer",
> +        "PTB-primitives-discarded-outside-viewport",
> +        "PTB-primitives-need-clipping",
> +        "PTB-primitives-discared-reversed",
> +        "QPU-total-idle-clk-cycles",
> +        "QPU-total-clk-cycles-vertex-coord-shading",
> +        "QPU-total-clk-cycles-fragment-shading",
> +        "QPU-total-clk-cycles-executing-valid-instr",
> +        "QPU-total-clk-cycles-waiting-TMU",
> +        "QPU-total-clk-cycles-waiting-scoreboard",
> +        "QPU-total-clk-cycles-waiting-varyings",
> +        "QPU-total-instr-cache-hit",
> +        "QPU-total-instr-cache-miss",
> +        "QPU-total-uniform-cache-hit",
> +        "QPU-total-uniform-cache-miss",
> +        "TMU-total-text-quads-processed",
> +        "TMU-total-text-cache-miss",
> +        "VPM-total-clk-cycles-VDW-stalled",
> +        "VPM-total-clk-cycles-VCD-stalled",
> +        "L2C-total-cache-hit",
> +        "L2C-total-cache-miss",
> +};

It would be great to build some piglit tests if we could.  Some easy
ones I can think of:

- Make sure that rendering a bunch of prims increments
  FEP-valid-primitives-rendered-pixels.  (Note that the count may exceed
  the number of prims in the draw when prims cross a tile boundary.)

- Make sure that rendering prims before starting our query or after
  ending our query doesn't increment the prims counters.

- Make sure that an unscissored glClear(COLOR|DEPTH) spends 0 time in
  VS/FS, but that actual drawing spends a nonzero amount.  (I think this
  would have caught the missing counter.)

- Make sure that TMU-total-text-quads-processed is incremented for
  shaders that do texturing but not for shaders that don't.

> +int vc4_get_driver_query_group_info(struct pipe_screen *pscreen,
> +                                    unsigned index,
> +                                    struct pipe_driver_query_group_info *info)
> +{
> +        struct vc4_screen *screen = vc4_screen(pscreen);
> +
> +        if (!screen->has_perfmon_ioctl)
> +                return 0;
> +
> +        if (!info)
> +                return 1;
> +
> +        if (index > 0)
> +                return 0;
> +
> +        info->name = "V3D counters";
> +        info->max_active_queries = DRM_VC4_MAX_PERF_COUNTERS;
> +        info->num_queries = ARRAY_SIZE(v3d_counter_names);
> +        return 1;
> +}
> +
> +int vc4_get_driver_query_info(struct pipe_screen *pscreen, unsigned index,
> +                              struct pipe_driver_query_info *info)
> +{
> +        struct vc4_screen *screen = vc4_screen(pscreen);
> +
> +        if (!screen->has_perfmon_ioctl)
> +                return 0;
> +
> +        if (!info)
> +                return ARRAY_SIZE(v3d_counter_names);
> +
> +        if (index >= ARRAY_SIZE(v3d_counter_names))
> +                return 0;
> +
> +        info->name = v3d_counter_names[index];
> +        info->query_type = PIPE_QUERY_DRIVER_SPECIFIC + index;
> +        info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE;
> +        info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
> +        info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;

Missing initialization of info->group_id.

> +        return 1;
> +}
> +
>  static struct pipe_query *
> -vc4_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
> +vc4_create_batch_query(struct pipe_context *pctx, unsigned num_queries,
> +                       unsigned *query_types)
>  {
> +        struct vc4_context *ctx = vc4_context(pctx);
>          struct vc4_query *query = calloc(1, sizeof(*query));
> +        struct drm_vc4_perfmon_create req;
> +        struct vc4_hwperfmon *hwperfmon;
> +        unsigned i, nhwqueries = 0;
> +        int ret;
> +
> +        if (!query)
> +                return NULL;
> +
> +        for (i = 0; i < num_queries; i++) {
> +                if (query_types[i] >= PIPE_QUERY_DRIVER_SPECIFIC)
> +                        nhwqueries++;
> +        }
> +
> +        /* We can't mix HW and non-HW queries. */
> +        if (nhwqueries && nhwqueries != num_queries)
> +                return NULL;
> +
> +        if (!nhwqueries)
> +                return (struct pipe_query *)query;
> +
> +        hwperfmon = calloc(1, sizeof(*hwperfmon));
> +        if (!hwperfmon)
> +                goto err_free_query;
> +
> +        for (i = 0; i < num_queries; i++)
> +                req.events[i] = query_types[i] - PIPE_QUERY_DRIVER_SPECIFIC;
> +
> +        req.ncounters = num_queries;
> +        ret = vc4_ioctl(ctx->fd, DRM_IOCTL_VC4_PERFMON_CREATE, &req);
> +        if (ret)
> +                goto err_free_hwperfmon;
> +
> +        hwperfmon->id = req.id;
> +        query->hwperfmon = hwperfmon;
> +        query->num_queries = num_queries;
>  
>          /* Note that struct pipe_query isn't actually defined anywhere. */
>          return (struct pipe_query *)query;
> +
> +err_free_hwperfmon:
> +        free(hwperfmon);
> +
> +err_free_query:
> +        free(query);
> +
> +        return NULL;
> +}
> +
> +static struct pipe_query *
> +vc4_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
> +{
> +        return vc4_create_batch_query(ctx, 1, &query_type);
>  }
>  
>  static void
> -vc4_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
> +vc4_destroy_query(struct pipe_context *pctx, struct pipe_query *pquery)
>  {
> +        struct vc4_context *ctx = vc4_context(pctx);
> +        struct vc4_query *query = (struct vc4_query *)pquery;
> +
> +        if (query->hwperfmon) {
> +                struct drm_vc4_perfmon_destroy req;
> +
> +                req.id = query->hwperfmon->id;
> +                vc4_ioctl(ctx->fd, DRM_IOCTL_VC4_PERFMON_DESTROY, &req);
> +                free(query->hwperfmon);
> +        }
> +
>          free(query);
>  }
>  
>  static boolean
> -vc4_begin_query(struct pipe_context *ctx, struct pipe_query *query)
> +vc4_begin_query(struct pipe_context *pctx, struct pipe_query *pquery)
>  {
> +        struct vc4_query *query = (struct vc4_query *)pquery;
> +        struct vc4_context *ctx = vc4_context(pctx);
> +
> +        if (!query->hwperfmon)
> +                return true;
> +
> +        /* Only one perfmon can be activated per context. */
> +        if (ctx->perfmon)
> +                return false;
> +
> +        ctx->perfmon = query->hwperfmon;

We need to vc4_flush() before changing ctx->perfmon, both here and in
vc4_end_query(), because you don't want rendering from before the start
or after the end of the query to be counted.

>          return true;
>  }
>  
>  static bool
> -vc4_end_query(struct pipe_context *ctx, struct pipe_query *query)
> +vc4_end_query(struct pipe_context *pctx, struct pipe_query *pquery)
>  {
> +        struct vc4_query *query = (struct vc4_query *)pquery;
> +        struct vc4_context *ctx = vc4_context(pctx);
> +
> +        if (!query->hwperfmon)
> +                return true;
> +
> +        if (ctx->perfmon != query->hwperfmon)
> +                return false;
> +
> +        ctx->perfmon = NULL;
>          return true;
>  }
>  
>  static boolean
> -vc4_get_query_result(struct pipe_context *ctx, struct pipe_query *query,
> +vc4_get_query_result(struct pipe_context *pctx, struct pipe_query *pquery,
>                       boolean wait, union pipe_query_result *vresult)
>  {
> -        uint64_t *result = &vresult->u64;
> +        struct vc4_context *ctx = vc4_context(pctx);
> +        struct vc4_query *query = (struct vc4_query *)pquery;
> +        struct drm_vc4_perfmon_get_values req;
> +        unsigned i;
> +        int ret;
> +
> +        if (!query->hwperfmon) {
> +                vresult->u64 = 0;
> +                return true;
> +	}
>  
> -        *result = 0;
> +        req.id = query->hwperfmon->id;
> +        req.values_ptr = (uintptr_t)query->hwperfmon->counters;
> +        ret = vc4_ioctl(ctx->fd, DRM_IOCTL_VC4_PERFMON_GET_VALUES, &req);
> +        if (ret)
> +                return false;

Something needs to block before handing us back the results here, or you
may just get 0s since the job hasn't completed yet.  I think that
blocking should be the kernel's responsibility.

> +
> +        for (i = 0; i < query->num_queries; i++)
> +                vresult[i].u64 = query->hwperfmon->counters[i];

You'll also need to capture the values at vc4_begin_query() and return
the current values minus those begin values; otherwise reuse of counter
objects will end up continually increasing the values returned from the
queries.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 832 bytes
Desc: not available
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20171222/6335d13b/attachment.sig>