[Mesa-dev] [PATCH 2/2] i965: Add support for GL_AMD_performance_monitor on Ironlake.

Kenneth Graunke kenneth at whitecape.org
Thu Apr 11 14:00:59 PDT 2013


Ironlake's counters are always enabled; userspace can simply send a
MI_REPORT_PERF_COUNT packet to take a snapshot of them.  This makes it
easy to implement.

The counters are documented in the source code for the intel-gpu-tools
intel_perf_counters utility.

Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
---
 src/mesa/drivers/dri/i965/Makefile.sources         |   1 +
 src/mesa/drivers/dri/i965/brw_context.c            |   4 +
 src/mesa/drivers/dri/i965/brw_context.h            |   7 +
 src/mesa/drivers/dri/i965/brw_defines.h            |   7 +
 .../drivers/dri/i965/brw_performance_monitor.c     | 372 +++++++++++++++++++++
 src/mesa/drivers/dri/intel/intel_extensions.c      |   3 +
 6 files changed, 394 insertions(+)
 create mode 100644 src/mesa/drivers/dri/i965/brw_performance_monitor.c

diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index be8d630..a9c2754 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -68,6 +68,7 @@ i965_FILES = \
 	brw_gs_state.c \
 	brw_lower_texture_gradients.cpp \
 	brw_misc_state.c \
+	brw_performance_monitor.c \
 	brw_program.c \
 	brw_primitive_restart.c \
 	brw_queryobj.c \
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index ceaf325..b8bb1b5 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -415,6 +415,10 @@ brwCreateContext(int api,
    _mesa_initialize_dispatch_tables(ctx);
    _mesa_initialize_vbo_vtxfmt(ctx);
 
+   if (ctx->Extensions.AMD_performance_monitor) {
+      brw_init_performance_monitors(brw);
+   }
+
    return true;
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 114c369..4a203a2 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1060,6 +1060,10 @@ struct brw_context
       bool begin_emitted;
    } query;
 
+   struct {
+      uint32_t total_counter_size;
+   } perfmon;
+
    int num_atoms;
    const struct brw_tracked_state **atoms;
 
@@ -1212,6 +1216,9 @@ void brw_upload_ubo_surfaces(struct brw_context *brw,
 			     struct gl_shader *shader,
 			     uint32_t *surf_offsets);
 
+/* brw_performance_monitor.c */
+void brw_init_performance_monitors(struct brw_context *brw);
+
 /* gen6_sol.c */
 void
 brw_begin_transform_feedback(struct gl_context *ctx, GLenum mode,
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index a13f9dc..1fea1d8 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -1649,6 +1649,13 @@ enum brw_wm_barycentric_interp_mode {
 
 #define CMD_MI_FLUSH                  0x0200
 
+#define GEN5_MI_REPORT_PERF_COUNT ((0x26 << 23) | (3 - 2))
+/* DW0 */
+# define GEN5_MI_COUNTER_SET_0      (0 << 6)
+# define GEN5_MI_COUNTER_SET_1      (1 << 6)
+/* DW1 */
+# define MI_COUNTER_ADDRESS_GTT     (1 << 0)
+/* DW2: a user-defined report ID (written to the buffer but can be anything) */
 
 /* Bitfields for the URB_WRITE message, DW2 of message header: */
 #define URB_WRITE_PRIM_END		0x1
diff --git a/src/mesa/drivers/dri/i965/brw_performance_monitor.c b/src/mesa/drivers/dri/i965/brw_performance_monitor.c
new file mode 100644
index 0000000..b351193
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_performance_monitor.c
@@ -0,0 +1,372 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file brw_performance_monitor.c
+ *
+ * Implementation of the GL_AMD_performance_monitor extension.
+ *
+ * Currently only for Ironlake.
+ */
+
+#include <limits.h>
+
+#include "main/bitset.h"
+#include "main/macros.h"
+#include "main/mtypes.h"
+#include "main/performance_monitor.h"
+
+#include "brw_context.h"
+#include "brw_defines.h"
+#include "intel_batchbuffer.h"
+
+/**
+ * i965 representation of a performance monitor object.
+ */
+struct brw_perf_monitor_object
+{
+   /** The base class. */
+   struct gl_perf_monitor_object base;
+
+   /**
+    * The GPU-facing BO, holding raw counter data in a hardware specific form.
+    */
+   drm_intel_bo *gpu_facing_bo;
+};
+
+/** Downcasting convenience macro. */
+static inline struct brw_perf_monitor_object *
+brw_perf_monitor(struct gl_perf_monitor_object *m)
+{
+   return (struct brw_perf_monitor_object *) m;
+}
+
+/******************************************************************************/
+
+/**
+ * Group information:
+ *  @{
+ */
+enum group_id {
+   A_COUNTERS = 0,
+};
+
+const static struct gl_perf_monitor_group perf_groups[] = {
+   [A_COUNTERS] = { "Aggregating Counters", INT_MAX }
+};
+/** @} */
+
+
+/**
+ * Ironlake counter information:
+ *  @{
+ */
+enum gen5_counter_id {
+   GEN5_FIRST_A_COUNTER = 0,
+
+   GEN5_CS_STARVED = GEN5_FIRST_A_COUNTER,
+   GEN5_CS_STALLED,
+   GEN5_VF_STARVED,
+   GEN5_VF_STALLED,
+   GEN5_VS_STARVED,
+   GEN5_VS_STALLED,
+   GEN5_GS_STARVED,
+   GEN5_GS_STALLED,
+   GEN5_CL_STARVED,
+   GEN5_CL_STALLED,
+   GEN5_SF_STARVED,
+   GEN5_SF_STALLED,
+   GEN5_WZ_STARVED,
+   GEN5_WZ_STALLED,
+   GEN5_Z_BUFFER_READ_WRITE,
+   GEN5_EU_ACTIVE,
+   GEN5_EU_SUSPENDED,
+   GEN5_THREADS_LOADED,
+   GEN5_FILTERING_ACTIVE,
+   GEN5_PS_EXECUTED,
+   GEN5_SUBSPANS_WRITTEN,
+   GEN5_BYTES_READ_FOR_TEXTURE_READS,
+   GEN5_TEXELS_RETURNED_FROM_SAMPLER,
+   GEN5_POLYGONS_NOT_CULLED,
+   GEN5_MASF_HAS_VALID_MESSAGE,
+   GEN5_WRITES_READS_FROM_RC,
+   GEN5_DP_READS,
+   GEN5_MASF_HAS_VALID_MESSAGE_NOT_CONSUMED_BY_SAMPLER,
+   GEN5_EU_STALLED_FOR_MATH,
+   /* Inclusive end of the A-counter range; aliases the last counter above. */
+   GEN5_LAST_A_COUNTER = GEN5_EU_STALLED_FOR_MATH,
+};
+
+#define A_COUNTER(id, name)     \
+   {                            \
+      .ID = id,                 \
+      .Name = name,             \
+      .GroupID = A_COUNTERS,    \
+      .Type = GL_UNSIGNED_INT,  \
+      .Minimum = { .u32 =  0 }, \
+      .Maximum = { .u32 = ~0 }, \
+   }
+
+const static struct gl_perf_monitor_counter gen5_counters[] = {
+   A_COUNTER(GEN5_CS_STARVED, "cycles the CS unit is starved"),
+   A_COUNTER(GEN5_CS_STALLED, "cycles the CS unit is stalled"),
+   A_COUNTER(GEN5_VF_STARVED, "cycles the VF unit is starved"),
+   A_COUNTER(GEN5_VF_STALLED, "cycles the VF unit is stalled"),
+   A_COUNTER(GEN5_VS_STARVED, "cycles the VS unit is starved"),
+   A_COUNTER(GEN5_VS_STALLED, "cycles the VS unit is stalled"),
+   A_COUNTER(GEN5_GS_STARVED, "cycles the GS unit is starved"),
+   A_COUNTER(GEN5_GS_STALLED, "cycles the GS unit is stalled"),
+   A_COUNTER(GEN5_CL_STARVED, "cycles the CL unit is starved"),
+   A_COUNTER(GEN5_CL_STALLED, "cycles the CL unit is stalled"),
+   A_COUNTER(GEN5_SF_STARVED, "cycles the SF unit is starved"),
+   A_COUNTER(GEN5_SF_STALLED, "cycles the SF unit is stalled"),
+   A_COUNTER(GEN5_WZ_STARVED, "cycles the WZ unit is starved"),
+   A_COUNTER(GEN5_WZ_STALLED, "cycles the WZ unit is stalled"),
+   A_COUNTER(GEN5_Z_BUFFER_READ_WRITE, "Z buffer read/write"),
+   A_COUNTER(GEN5_EU_ACTIVE, "cycles each EU was active"),
+   A_COUNTER(GEN5_EU_SUSPENDED, "cycles each EU was suspended"),
+   A_COUNTER(GEN5_THREADS_LOADED, "cycles threads loaded all EUs"),
+   A_COUNTER(GEN5_FILTERING_ACTIVE, "cycles filtering active"),
+   A_COUNTER(GEN5_PS_EXECUTED, "cycles PS threads executed"),
+   A_COUNTER(GEN5_SUBSPANS_WRITTEN, "subspans written to RC"),
+   A_COUNTER(GEN5_BYTES_READ_FOR_TEXTURE_READS, "bytes read for texture reads"),
+   A_COUNTER(GEN5_TEXELS_RETURNED_FROM_SAMPLER, "texels returned from sampler"),
+   A_COUNTER(GEN5_POLYGONS_NOT_CULLED, "polygons not culled"),
+   A_COUNTER(GEN5_MASF_HAS_VALID_MESSAGE, "clocks MASF has valid message"),
+   A_COUNTER(GEN5_WRITES_READS_FROM_RC, "64b writes/reads from RC"),
+   A_COUNTER(GEN5_DP_READS, "reads on dataport"),
+   A_COUNTER(GEN5_MASF_HAS_VALID_MESSAGE_NOT_CONSUMED_BY_SAMPLER, "clocks MASF has valid msg not consumed by sampler"),
+   A_COUNTER(GEN5_EU_STALLED_FOR_MATH, "cycles any EU is stalled for math"),
+};
+/** @} */
+
+/******************************************************************************/
+
+static void
+snapshot_aggregating_counters(struct brw_context *brw,
+                              drm_intel_bo *bo, uint32_t offset)
+{
+   struct intel_context *intel = &brw->intel;
+
+   BEGIN_BATCH(6);
+   OUT_BATCH(GEN5_MI_REPORT_PERF_COUNT | GEN5_MI_COUNTER_SET_0);
+   OUT_RELOC(bo,
+             I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+             offset);
+   OUT_BATCH(0);
+
+   OUT_BATCH(GEN5_MI_REPORT_PERF_COUNT | GEN5_MI_COUNTER_SET_1);
+   OUT_RELOC(bo,
+             I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+             offset + 64);
+   OUT_BATCH(0);
+   ADVANCE_BATCH();
+}
+
+static bool
+aggregating_counters_needed(struct brw_context *brw,
+                            struct gl_perf_monitor_object *m)
+{
+   struct intel_context *intel = &brw->intel;
+
+   if (intel->gen == 5) {
+      return BITSET_TEST_RANGE(m->ActiveCounters,
+                               GEN5_FIRST_A_COUNTER, GEN5_LAST_A_COUNTER);
+   }
+   assert(!"Unsupported generation in performance counter code.");
+   return false;
+}
+
+/******************************************************************************/
+
+/**
+ * Create a new performance monitor object.
+ */
+static struct gl_perf_monitor_object *
+brw_new_perf_monitor()
+{
+   return calloc(1, sizeof(struct brw_perf_monitor_object));
+}
+
+/**
+ * Delete a performance monitor object.
+ */
+static void
+brw_delete_perf_monitor(struct gl_perf_monitor_object *m)
+{
+   struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
+
+   if (monitor->gpu_facing_bo)
+      drm_intel_bo_unreference(monitor->gpu_facing_bo);
+
+   free(monitor);
+}
+
+/**
+ * Driver hook for glBeginPerformanceMonitorAMD().
+ */
+static void
+brw_begin_perf_monitor(struct gl_context *ctx,
+                       struct gl_perf_monitor_object *m)
+{
+   struct brw_context *brw = brw_context(ctx);
+   struct intel_context *intel = &brw->intel;
+   struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
+
+   m->Active = true;
+
+   /* If the GPU-facing BO already exists, throw it away.  It contains
+    * old results and we're not interested in those any more.
+    */
+   if (monitor->gpu_facing_bo)
+      drm_intel_bo_unreference(monitor->gpu_facing_bo);
+
+   /* Create a new GPU-facing BO.  NOTE(review): result is not NULL-checked. */
+   monitor->gpu_facing_bo =
+      drm_intel_bo_alloc(intel->bufmgr, "performance monitor", 4096, 1);
+
+   /* Take the starting snapshot of the active counters at BO offset 0. */
+   if (aggregating_counters_needed(brw, m)) {
+      snapshot_aggregating_counters(brw, monitor->gpu_facing_bo, 0);
+   }
+}
+
+/**
+ * Driver hook for glEndPerformanceMonitorAMD().
+ */
+static void
+brw_end_perf_monitor(struct gl_context *ctx,
+                     struct gl_perf_monitor_object *m)
+{
+   struct brw_context *brw = brw_context(ctx);
+   struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
+   if (aggregating_counters_needed(brw, m)) {
+      snapshot_aggregating_counters(brw, monitor->gpu_facing_bo,
+                                    brw->perfmon.total_counter_size);
+   }
+}
+
+/**
+ * Reset a performance monitor, throwing away any results.
+ */
+static void
+brw_reset_perf_monitor(struct gl_context *ctx,
+                       struct gl_perf_monitor_object *m)
+{
+   struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
+
+   if (monitor->gpu_facing_bo) {
+      drm_intel_bo_unreference(monitor->gpu_facing_bo);
+      monitor->gpu_facing_bo = NULL;
+   }
+
+   if (m->Active) {
+      brw_begin_perf_monitor(ctx, m);
+   }
+}
+
+/**
+ * Is a performance monitor result available?
+ */
+static GLboolean
+brw_is_perf_monitor_result_available(struct gl_perf_monitor_object *m)
+{
+   struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
+   return !m->Active && monitor->gpu_facing_bo &&
+          !drm_intel_bo_busy(monitor->gpu_facing_bo);
+}
+
+/**
+ * Get the performance monitor result.
+ */
+static void
+brw_get_perf_monitor_result(struct gl_context *ctx,
+                            struct gl_perf_monitor_object *m,
+                            GLsizei data_size,
+                            GLuint *data,
+                            GLint *bytes_written)
+{
+   struct brw_context *brw = brw_context(ctx);
+   struct brw_perf_monitor_object *monitor = brw_perf_monitor(m);
+
+   /* This hook should only be called when results are available. */
+   assert(monitor->gpu_facing_bo != NULL);
+
+   drm_intel_bo_map(monitor->gpu_facing_bo, false);
+   const uint32_t *gpu_bo = monitor->gpu_facing_bo->virtual;
+
+   /* Copy data from the GPU-facing BO to the supplied array as
+    * <group ID, counter ID, value> triples, one per active counter.  The
+    * BO is read as an array of dwords, so the begin/end snapshot positions
+    * computed below are dword indices, not byte offsets.
+    */
+   GLsizei offset = 0;
+   for (int i = 0; i < ctx->PerfMonitor.NumCounters; i++) {
+      const struct gl_perf_monitor_counter *c = &ctx->PerfMonitor.Counters[i];
+      if (!BITSET_TEST(m->ActiveCounters, i))
+         continue;
+
+      data[offset++] = c->GroupID;
+      data[offset++] = c->ID;
+
+      /* Skip REPORT_ID and TIMESTAMP fields; end snapshot follows begin. */
+      uint32_t first_index = 3 + c->ID;
+      uint32_t second_index =
+         brw->perfmon.total_counter_size / sizeof(uint32_t) + first_index;
+
+      /* Won't work for uint64_t values, but we don't have any */
+      data[offset] = gpu_bo[second_index] - gpu_bo[first_index];
+      offset += _mesa_perf_monitor_counter_size(c) / sizeof(uint32_t);
+   }
+
+   drm_intel_bo_unmap(monitor->gpu_facing_bo);
+
+   if (bytes_written)
+      *bytes_written = offset * sizeof(uint32_t);
+}
+
+void
+brw_init_performance_monitors(struct brw_context *brw)
+{
+   struct intel_context *intel = &brw->intel;
+   struct gl_context *ctx = &intel->ctx;
+
+   ctx->Driver.NewPerfMonitor = brw_new_perf_monitor;
+   ctx->Driver.DeletePerfMonitor = brw_delete_perf_monitor;
+   ctx->Driver.BeginPerfMonitor = brw_begin_perf_monitor;
+   ctx->Driver.EndPerfMonitor = brw_end_perf_monitor;
+   ctx->Driver.ResetPerfMonitor = brw_reset_perf_monitor;
+   ctx->Driver.IsPerfMonitorResultAvailable = brw_is_perf_monitor_result_available;
+   ctx->Driver.GetPerfMonitorResult = brw_get_perf_monitor_result;
+
+   if (intel->gen == 5) {
+      ctx->PerfMonitor.Groups = perf_groups;
+      ctx->PerfMonitor.NumGroups = ARRAY_SIZE(perf_groups);
+
+      ctx->PerfMonitor.Counters = gen5_counters;
+      ctx->PerfMonitor.NumCounters = ARRAY_SIZE(gen5_counters);
+
+      brw->perfmon.total_counter_size =
+         (3 + ctx->PerfMonitor.NumCounters) * sizeof(uint32_t);
+   }
+}
diff --git a/src/mesa/drivers/dri/intel/intel_extensions.c b/src/mesa/drivers/dri/intel/intel_extensions.c
index 1ad728a..3ef7fb9 100755
--- a/src/mesa/drivers/dri/intel/intel_extensions.c
+++ b/src/mesa/drivers/dri/intel/intel_extensions.c
@@ -111,6 +111,9 @@ intelInitExtensions(struct gl_context *ctx)
       ctx->Extensions.ARB_texture_storage_multisample = true;
    }
 
+   if (intel->gen == 5)
+      ctx->Extensions.AMD_performance_monitor = true;
+
    if (intel->gen >= 5) {
       ctx->Extensions.ARB_texture_query_lod = true;
       ctx->Extensions.EXT_timer_query = true;
-- 
1.8.2.1



More information about the mesa-dev mailing list