[Mesa-dev] [PATCH v2 2/2] broadcom/vc4: Add support for HW perfmon

Thu Jan 11 09:22:04 UTC 2018

The V3D engine provides several perf counters.
Implement ->get_driver_query_[group_]info() so that these counters are
exposed through the GL_AMD_performance_monitor extension.

Signed-off-by: Boris Brezillon <boris.brezillon at free-electrons.com>
---
Changes in v2 (all reported by Eric):
- Add missing "TLB-quads-passing-z-and-stencil-test" perf counter
- Make sure we wait for the results to be available before returning
  true in vc4_get_query_result()
- Flush pending jobs in vc4_begin_query() and vc4_end_query() so that
  perf counters are not polluted by unrelated jobs
- Reset the counters in vc4_begin_query()
- Initialize ->group_id in vc4_get_driver_query_info()
---
 src/gallium/drivers/vc4/vc4_context.h |  18 +++
 src/gallium/drivers/vc4/vc4_job.c     |   7 ++
 src/gallium/drivers/vc4/vc4_query.c   | 228 ++++++++++++++++++++++++++++++++--
 src/gallium/drivers/vc4/vc4_screen.c  |   7 ++
 src/gallium/drivers/vc4/vc4_screen.h  |   1 +
 5 files changed, 249 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h
index 4a1e4093f1a0..41241d36a4bc 100644
--- a/src/gallium/drivers/vc4/vc4_context.h
+++ b/src/gallium/drivers/vc4/vc4_context.h
@@ -219,6 +219,13 @@ struct vc4_job_key {
         struct pipe_surface *zsbuf;
 };
 
+struct vc4_hwperfmon { /* State of one kernel perfmon backing a HW query. */
+        uint32_t id; /* Id returned by PERFMON_CREATE; 0 until a query begins. */
+        uint64_t last_seqno; /* Seqno of the last submitted job that used it. */
+        uint8_t events[DRM_VC4_MAX_PERF_COUNTERS]; /* Counters to sample. */
+        uint64_t counters[DRM_VC4_MAX_PERF_COUNTERS]; /* Last values read. */
+};
+
 /**
  * A complete bin/render job.
  *
@@ -306,6 +313,9 @@ struct vc4_job {
         /** Any flags to be passed in drm_vc4_submit_cl.flags. */
         uint32_t flags;
 
+	/* Performance monitor attached to this job. */
+	struct vc4_hwperfmon *perfmon;
+
         struct vc4_job_key key;
 };
 
@@ -387,6 +397,8 @@ struct vc4_context {
         struct pipe_viewport_state viewport;
         struct vc4_constbuf_stateobj constbuf[PIPE_SHADER_TYPES];
         struct vc4_vertexbuf_stateobj vertexbuf;
+
+        struct vc4_hwperfmon *perfmon;
         /** @} */
 };
 
@@ -444,6 +456,12 @@ vc4_sampler_state(struct pipe_sampler_state *psampler)
         return (struct vc4_sampler_state *)psampler;
 }
 
+int vc4_get_driver_query_group_info(struct pipe_screen *pscreen,
+                                    unsigned index,
+                                    struct pipe_driver_query_group_info *info);
+int vc4_get_driver_query_info(struct pipe_screen *pscreen, unsigned index,
+                              struct pipe_driver_query_info *info);
+
 struct pipe_context *vc4_context_create(struct pipe_screen *pscreen,
                                         void *priv, unsigned flags);
 void vc4_draw_init(struct pipe_context *pctx);
diff --git a/src/gallium/drivers/vc4/vc4_job.c b/src/gallium/drivers/vc4/vc4_job.c
index 7fe20c16bad9..f0a59781b298 100644
--- a/src/gallium/drivers/vc4/vc4_job.c
+++ b/src/gallium/drivers/vc4/vc4_job.c
@@ -90,6 +90,9 @@ vc4_job_create(struct vc4_context *vc4)
         job->draw_max_x = 0;
         job->draw_max_y = 0;
 
+        if (vc4->perfmon)
+                job->perfmon = vc4->perfmon;
+
         return job;
 }
 
@@ -453,6 +456,8 @@ vc4_job_submit(struct vc4_context *vc4, struct vc4_job *job)
         submit.shader_rec_count = job->shader_rec_count;
         submit.uniforms = (uintptr_t)job->uniforms.base;
         submit.uniforms_size = cl_offset(&job->uniforms);
+	if (job->perfmon)
+		submit.perfmonid = job->perfmon->id;
 
         assert(job->draw_min_x != ~0 && job->draw_min_y != ~0);
         submit.min_x_tile = job->draw_min_x / job->tile_width;
@@ -485,6 +490,8 @@ vc4_job_submit(struct vc4_context *vc4, struct vc4_job *job)
                         warned = true;
                 } else if (!ret) {
                         vc4->last_emit_seqno = submit.seqno;
+                        if (job->perfmon)
+                                job->perfmon->last_seqno = submit.seqno;
                 }
         }
 
diff --git a/src/gallium/drivers/vc4/vc4_query.c b/src/gallium/drivers/vc4/vc4_query.c
index ddf8f8fb0c2c..6e4681e93ccb 100644
--- a/src/gallium/drivers/vc4/vc4_query.c
+++ b/src/gallium/drivers/vc4/vc4_query.c
@@ -22,8 +22,9 @@
  */
 
 /**
- * Stub support for occlusion queries.
+ * Expose V3D HW perf counters.
  *
+ * We also have code to fake support for occlusion queries.
  * Since we expose support for GL 2.0, we have to expose occlusion queries,
  * but the spec allows you to expose 0 query counter bits, so we just return 0
  * as the result of all our queries.
@@ -32,49 +33,252 @@
 
 struct vc4_query
 {
-        uint8_t pad;
+        unsigned num_queries; /* Number of HW counters; 0 for a stub query. */
+        struct vc4_hwperfmon *hwperfmon; /* NULL for a non-HW (stub) query. */
 };
 
+static const char *const v3d_counter_names[] = { /* index == HW event id */
+        "FEP-valid-primitives-no-rendered-pixels",
+        "FEP-valid-primitives-rendered-pixels",
+        "FEP-clipped-quads",
+        "FEP-valid-quads",
+        "TLB-quads-not-passing-stencil-test",
+        "TLB-quads-not-passing-z-and-stencil-test",
+        "TLB-quads-passing-z-and-stencil-test",
+        "TLB-quads-with-zero-coverage",
+        "TLB-quads-with-non-zero-coverage",
+        "TLB-quads-written-to-color-buffer",
+        "PTB-primitives-discarded-outside-viewport",
+        "PTB-primitives-need-clipping",
+        "PTB-primitives-discarded-reversed",
+        "QPU-total-idle-clk-cycles",
+        "QPU-total-clk-cycles-vertex-coord-shading",
+        "QPU-total-clk-cycles-fragment-shading",
+        "QPU-total-clk-cycles-executing-valid-instr",
+        "QPU-total-clk-cycles-waiting-TMU",
+        "QPU-total-clk-cycles-waiting-scoreboard",
+        "QPU-total-clk-cycles-waiting-varyings",
+        "QPU-total-instr-cache-hit",
+        "QPU-total-instr-cache-miss",
+        "QPU-total-uniform-cache-hit",
+        "QPU-total-uniform-cache-miss",
+        "TMU-total-text-quads-processed",
+        "TMU-total-text-cache-miss",
+        "VPM-total-clk-cycles-VDW-stalled",
+        "VPM-total-clk-cycles-VCD-stalled",
+        "L2C-total-cache-hit",
+        "L2C-total-cache-miss",
+};
+
+/* Describe the single "V3D counters" group: returns 1 when info is NULL or
+ * once info is filled for index 0, and 0 for any other index.
+ */
+int vc4_get_driver_query_group_info(struct pipe_screen *pscreen,
+                                    unsigned index,
+                                    struct pipe_driver_query_group_info *info)
+{
+        /* No group is exposed when the screen lacks the perfmon ioctls. */
+        if (!vc4_screen(pscreen)->has_perfmon_ioctl)
+                return 0;
+        if (info == NULL)
+                return 1;
+        if (index != 0)
+                return 0;
+
+        info->num_queries = ARRAY_SIZE(v3d_counter_names);
+        info->max_active_queries = DRM_VC4_MAX_PERF_COUNTERS;
+        info->name = "V3D counters";
+        return 1;
+}
+
+/* Describe HW counter number index: returns the counter count when info is
+ * NULL, 1 once info is filled, and 0 if index is out of range or the screen
+ * lacks the perfmon ioctls. */
+int vc4_get_driver_query_info(struct pipe_screen *pscreen, unsigned index,
+                              struct pipe_driver_query_info *info)
+{
+        if (!vc4_screen(pscreen)->has_perfmon_ioctl)
+                return 0;
+        if (info == NULL)
+                return ARRAY_SIZE(v3d_counter_names);
+        if (index >= ARRAY_SIZE(v3d_counter_names))
+                return 0;
+
+        /* Each counter is a batchable, cumulative 64-bit value in group 0. */
+        info->name = v3d_counter_names[index];
+        info->query_type = PIPE_QUERY_DRIVER_SPECIFIC + index;
+        info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
+        info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE;
+        info->group_id = 0;
+        info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
+        return 1;
+}
+
 static struct pipe_query *
-vc4_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
+vc4_create_batch_query(struct pipe_context *pctx, unsigned num_queries,
+                       unsigned *query_types)
 {
         struct vc4_query *query = calloc(1, sizeof(*query));
+        struct vc4_hwperfmon *hwperfmon;
+        unsigned i, nhwqueries = 0;
+
+        if (!query)
+                return NULL;
+
+        for (i = 0; i < num_queries; i++) {
+                if (query_types[i] >= PIPE_QUERY_DRIVER_SPECIFIC)
+                        nhwqueries++;
+        }
+
+        /* We can't mix HW and non-HW queries. */
+        if (nhwqueries && nhwqueries != num_queries)
+                return NULL;
+
+        if (!nhwqueries)
+                return (struct pipe_query *)query;
+
+        hwperfmon = calloc(1, sizeof(*hwperfmon));
+        if (!hwperfmon)
+                goto err_free_query;
+
+        for (i = 0; i < num_queries; i++)
+                hwperfmon->events[i] = query_types[i] -
+                                       PIPE_QUERY_DRIVER_SPECIFIC;
+
+        query->hwperfmon = hwperfmon;
+        query->num_queries = num_queries;
 
         /* Note that struct pipe_query isn't actually defined anywhere. */
         return (struct pipe_query *)query;
+
+err_free_query:
+        free(query);
+
+        return NULL;
+}
+
+static struct pipe_query *
+vc4_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
+{ /* A single query is a batch of one; index is not used. */
+        return vc4_create_batch_query(ctx, 1, &query_type);
 }
 
 static void
-vc4_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
+vc4_destroy_query(struct pipe_context *pctx, struct pipe_query *pquery)
 {
+        struct vc4_context *ctx = vc4_context(pctx);
+        struct vc4_query *query = (struct vc4_query *)pquery;
+
+        if (query->hwperfmon && query->hwperfmon->id) {
+                if (query->hwperfmon->id) {
+                        struct drm_vc4_perfmon_destroy req = { };
+
+                        req.id = query->hwperfmon->id;
+                        vc4_ioctl(ctx->fd, DRM_IOCTL_VC4_PERFMON_DESTROY,
+                                  &req);
+                }
+
+                free(query->hwperfmon);
+        }
+
         free(query);
 }
 
 static boolean
-vc4_begin_query(struct pipe_context *ctx, struct pipe_query *query)
+vc4_begin_query(struct pipe_context *pctx, struct pipe_query *pquery)
 {
+        struct vc4_query *query = (struct vc4_query *)pquery;
+        struct vc4_context *ctx = vc4_context(pctx);
+        struct drm_vc4_perfmon_create req = { };
+        unsigned i;
+        int ret;
+
+        if (!query->hwperfmon)
+                return true;
+
+        /* Only one perfmon can be activated per context. */
+        if (ctx->perfmon)
+                return false;
+
+        /* Reset the counters by destroying the previously allocated perfmon */
+        if (query->hwperfmon->id) {
+                struct drm_vc4_perfmon_destroy destroyreq = { };
+
+                destroyreq.id = query->hwperfmon->id;
+                vc4_ioctl(ctx->fd, DRM_IOCTL_VC4_PERFMON_DESTROY, &destroyreq);
+        }
+
+        for (i = 0; i < query->num_queries; i++)
+                req.events[i] = query->hwperfmon->events[i];
+
+        req.ncounters = query->num_queries;
+        ret = vc4_ioctl(ctx->fd, DRM_IOCTL_VC4_PERFMON_CREATE, &req);
+        if (ret)
+                return false;
+
+        query->hwperfmon->id = req.id;
+
+        /* Make sure all pendings jobs are flushed before activating the
+         * perfmon.
+         */
+        vc4_flush(pctx);
+        ctx->perfmon = query->hwperfmon;
         return true;
 }
 
 static bool
-vc4_end_query(struct pipe_context *ctx, struct pipe_query *query)
+vc4_end_query(struct pipe_context *pctx, struct pipe_query *pquery)
 {
+        struct vc4_query *query = (struct vc4_query *)pquery;
+        struct vc4_context *ctx = vc4_context(pctx);
+
+        if (!query->hwperfmon)
+                return true;
+
+        if (ctx->perfmon != query->hwperfmon)
+                return false;
+
+        /* Make sure all pendings jobs are flushed before deactivating the
+         * perfmon.
+         */
+        vc4_flush(pctx);
+        ctx->perfmon = NULL;
         return true;
 }
 
 static boolean
-vc4_get_query_result(struct pipe_context *ctx, struct pipe_query *query,
+vc4_get_query_result(struct pipe_context *pctx, struct pipe_query *pquery,
                      boolean wait, union pipe_query_result *vresult)
 {
-        uint64_t *result = &vresult->u64;
+        struct vc4_context *ctx = vc4_context(pctx);
+        struct vc4_query *query = (struct vc4_query *)pquery;
+        struct drm_vc4_perfmon_get_values req;
+        unsigned i;
+        int ret;
+
+        if (!query->hwperfmon) {
+                vresult->u64 = 0;
+                return true;
+        }
 
-        *result = 0;
+        if (!vc4_wait_seqno(ctx->screen, query->hwperfmon->last_seqno,
+                            wait ? PIPE_TIMEOUT_INFINITE : 0, "perfmon"))
+                return false;
+
+        req.id = query->hwperfmon->id;
+        req.values_ptr = (uintptr_t)query->hwperfmon->counters;
+        ret = vc4_ioctl(ctx->fd, DRM_IOCTL_VC4_PERFMON_GET_VALUES, &req);
+        if (ret)
+                return false;
+
+        for (i = 0; i < query->num_queries; i++)
+                vresult->batch[i].u64 = query->hwperfmon->counters[i];
 
         return true;
 }
 
 static void
-vc4_set_active_query_state(struct pipe_context *pipe, boolean enable)
+vc4_set_active_query_state(struct pipe_context *pctx, boolean enable) /* no-op */
 {
 }
 
@@ -82,10 +286,10 @@ void
 vc4_query_init(struct pipe_context *pctx)
 {
         pctx->create_query = vc4_create_query;
+        pctx->create_batch_query = vc4_create_batch_query;
         pctx->destroy_query = vc4_destroy_query;
         pctx->begin_query = vc4_begin_query;
         pctx->end_query = vc4_end_query;
         pctx->get_query_result = vc4_get_query_result;
-	pctx->set_active_query_state = vc4_set_active_query_state;
+        pctx->set_active_query_state = vc4_set_active_query_state;
 }
-
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index a42ba675c130..e0875dafb5ca 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -696,6 +696,8 @@ vc4_screen_create(int fd, struct renderonly *ro)
                 vc4_has_feature(screen, DRM_VC4_PARAM_SUPPORTS_THREADED_FS);
         screen->has_madvise =
                 vc4_has_feature(screen, DRM_VC4_PARAM_SUPPORTS_MADVISE);
+        screen->has_perfmon_ioctl =
+                vc4_has_feature(screen, DRM_VC4_PARAM_SUPPORTS_PERFMON);
 
         if (!vc4_get_chip_info(screen))
                 goto fail;
@@ -722,6 +724,11 @@ vc4_screen_create(int fd, struct renderonly *ro)
         pscreen->get_compiler_options = vc4_screen_get_compiler_options;
         pscreen->query_dmabuf_modifiers = vc4_screen_query_dmabuf_modifiers;
 
+        if (screen->has_perfmon_ioctl) {
+                pscreen->get_driver_query_group_info = vc4_get_driver_query_group_info;
+                pscreen->get_driver_query_info = vc4_get_driver_query_info;
+        }
+
         return pscreen;
 
 fail:
diff --git a/src/gallium/drivers/vc4/vc4_screen.h b/src/gallium/drivers/vc4/vc4_screen.h
index 09d1c342ed19..0b884423ba5a 100644
--- a/src/gallium/drivers/vc4/vc4_screen.h
+++ b/src/gallium/drivers/vc4/vc4_screen.h
@@ -97,6 +97,7 @@ struct vc4_screen {
         bool has_threaded_fs;
         bool has_madvise;
         bool has_tiling_ioctl;
+        bool has_perfmon_ioctl;
 
         struct vc4_simulator_file *sim_file;
 };
-- 
2.11.0