Mesa (master): nvc0: implement MP performance counters

Christoph Bumiller chrisbmr at kemper.freedesktop.org
Fri Mar 29 00:11:59 UTC 2013


Module: Mesa
Branch: master
Commit: ee624ced364bfd2f896809874ef3a808a11c5ecf
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=ee624ced364bfd2f896809874ef3a808a11c5ecf

Author: Christoph Bumiller <e0425955 at student.tuwien.ac.at>
Date:   Wed Mar 27 23:39:06 2013 +0100

nvc0: implement MP performance counters

There are more counters, but this commit only adds most of those that
are handled directly by the shader processors.
The other counter domains are not handled on the multiprocessor and
there are no FIFO object methods for configuring them.
Instead, they have to be programmed by the kernel via PCOUNTER, and
the interface for this isn't in place yet.

---

 src/gallium/drivers/nvc0/nvc0_context.h     |    3 +-
 src/gallium/drivers/nvc0/nvc0_program.c     |    6 +-
 src/gallium/drivers/nvc0/nvc0_query.c       |  393 ++++++++++++++++++++++++++-
 src/gallium/drivers/nvc0/nvc0_screen.c      |    6 +
 src/gallium/drivers/nvc0/nvc0_screen.h      |   71 +++++
 src/gallium/drivers/nvc0/nvc0_winsys.h      |    2 +
 src/gallium/drivers/nvc0/nve4_compute.c     |   11 +-
 src/gallium/drivers/nvc0/nve4_compute.xml.h |   70 +++++
 8 files changed, 556 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/nvc0/nvc0_context.h b/src/gallium/drivers/nvc0/nvc0_context.h
index d9aa378..799d9b9 100644
--- a/src/gallium/drivers/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nvc0/nvc0_context.h
@@ -84,7 +84,8 @@
 #define NVC0_BIND_CP_GLOBAL      49
 #define NVC0_BIND_CP_DESC        50
 #define NVC0_BIND_CP_SCREEN      51
-#define NVC0_BIND_CP_COUNT       52
+#define NVC0_BIND_CP_QUERY       52
+#define NVC0_BIND_CP_COUNT       53
 
 /* bufctx for other operations */
 #define NVC0_BIND_2D            0
diff --git a/src/gallium/drivers/nvc0/nvc0_program.c b/src/gallium/drivers/nvc0/nvc0_program.c
index 22dfaf9..15f683c 100644
--- a/src/gallium/drivers/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nvc0/nvc0_program.c
@@ -777,8 +777,8 @@ nvc0_program_destroy(struct nvc0_context *nvc0, struct nvc0_program *prog)
 
    if (prog->mem)
       nouveau_heap_free(&prog->mem);
-
-   FREE(prog->code);
+   if (prog->code)
+      FREE(prog->code); /* may be 0 for hardcoded shaders */
    FREE(prog->immd_data);
    FREE(prog->relocs);
    if (prog->type == PIPE_SHADER_COMPUTE && prog->cp.syms)
@@ -807,5 +807,5 @@ nvc0_program_symbol_offset(const struct nvc0_program *prog, uint32_t label)
    for (i = 0; i < prog->cp.num_syms; ++i)
       if (syms[i].label == label)
          return prog->code_base + base + syms[i].offset;
-   return ~0;
+   return prog->code_base; /* no symbols or symbol not found */
 }
diff --git a/src/gallium/drivers/nvc0/nvc0_query.c b/src/gallium/drivers/nvc0/nvc0_query.c
index d329148..5c4431e 100644
--- a/src/gallium/drivers/nvc0/nvc0_query.c
+++ b/src/gallium/drivers/nvc0/nvc0_query.c
@@ -26,6 +26,7 @@
 
 #include "nvc0_context.h"
 #include "nouveau/nv_object.xml.h"
+#include "nve4_compute.xml.h"
 
 #define NVC0_QUERY_STATE_READY   0
 #define NVC0_QUERY_STATE_ACTIVE  1
@@ -36,6 +37,7 @@ struct nvc0_query {
    uint32_t *data;
    uint16_t type;
    uint16_t index;
+   int8_t ctr[4];
    uint32_t sequence;
    struct nouveau_bo *bo;
    uint32_t base;
@@ -49,6 +51,11 @@ struct nvc0_query {
 
 #define NVC0_QUERY_ALLOC_SPACE 256
 
+static void nve4_mp_pm_query_begin(struct nvc0_context *, struct nvc0_query *);
+static void nve4_mp_pm_query_end(struct nvc0_context *, struct nvc0_query *);
+static boolean nve4_mp_pm_query_result(struct nvc0_context *,
+                                       struct nvc0_query *, void *, boolean);
+
 static INLINE struct nvc0_query *
 nvc0_query(struct pipe_query *pipe)
 {
@@ -132,6 +139,16 @@ nvc0_query_create(struct pipe_context *pipe, unsigned type)
       space = 16;
       break;
    default:
+      if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS &&
+          nvc0->screen->base.device->drm_version >= 0x01000101) {
+         if (type >= NVE4_PM_QUERY(0) &&
+             type <= NVE4_PM_QUERY_MAX) {
+            /* 8 counters per MP + clock */
+            space = 12 * nvc0->screen->mp_count * sizeof(uint32_t);
+            break;
+         }
+      }
+      debug_printf("invalid query type: %u\n", type);
       FREE(q);
       return NULL;
    }
@@ -244,6 +261,8 @@ nvc0_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
       nvc0_query_get(push, q, 0xc0 + 0x90, 0x0e809002); /* TEP, LAUNCHES */
       break;
    default:
+      if (q->type >= NVE4_PM_QUERY(0) && q->type <= NVE4_PM_QUERY_MAX)
+         nve4_mp_pm_query_begin(nvc0, q);
       break;
    }
    q->state = NVC0_QUERY_STATE_ACTIVE;
@@ -314,7 +333,8 @@ nvc0_query_end(struct pipe_context *pipe, struct pipe_query *pq)
       nvc0_query_get(push, q, 0x00, 0x0d005002 | (q->index << 5));
       break;
    default:
-      assert(0);
+      if (q->type >= NVE4_PM_QUERY(0) && q->type <= NVE4_PM_QUERY_MAX)
+         nve4_mp_pm_query_end(nvc0, q);
       break;
    }
 }
@@ -343,6 +363,9 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq,
    uint64_t *data64 = (uint64_t *)q->data;
    unsigned i;
 
+   if (q->type >= NVE4_PM_QUERY(0) && q->type <= NVE4_PM_QUERY_MAX)
+      return nve4_mp_pm_query_result(nvc0, q, result, wait);
+
    if (q->state != NVC0_QUERY_STATE_READY)
       nvc0_query_update(nvc0->screen->base.client, q);
 
@@ -399,6 +422,7 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq,
       res32[0] = q->data[1];
       break;
    default:
+      assert(0); /* can't happen, we don't create queries with invalid type */
       return FALSE;
    }
 
@@ -513,6 +537,373 @@ nvc0_so_target_save_offset(struct pipe_context *pipe,
    nvc0_query_end(pipe, targ->pq);
 }
 
+
+/* === PERFORMANCE MONITORING COUNTERS === */
+
+/* Code to read out MP counters: They are accessible via mmio, too, but let's
+ * just avoid mapping registers in userspace. We'd have to know which MPs are
+ * enabled/present, too, and that information is not presently exposed.
+ * We could add a kernel interface for it, but reading the counters like this
+ * has the advantage of being async (if get_result isn't called immediately).
+ */
+static const uint64_t nve4_read_mp_pm_counters_code[] =
+{
+   0x2042004270420047ULL, /* sched */
+   0x2800400000001de4ULL, /* mov b32 $r0 c0[0] (04) */
+   0x2c0000000c009c04ULL, /* mov b32 $r2 $physid (20) */
+   0x2800400010005de4ULL, /* mov b32 $r1 c0[4] (04) */
+   0x2c0000008400dc04ULL, /* mov b32 $r3 $tidx (27) */
+   0x7000c01050209c03ULL, /* ext u32 $r2 $r2 0x0414 (04) */
+   0x2c00000010011c04ULL, /* mov b32 $r4 $pm0 (20) */
+   0x190e0000fc33dc03ULL, /* set $p1 eq u32 $r3 0 (04) */
+   0x2280428042804277ULL, /* sched */
+   0x2c00000014015c04ULL, /* mov b32 $r5 $pm1 (27) */
+   0x10000000c0209c02ULL, /* mul $r2 u32 $r2 u32 48 (04) */
+   0x2c00000018019c04ULL, /* mov b32 $r6 $pm2 (28) */
+   0x4801000008001c03ULL, /* add b32 ($r0 $c) $r0 $r2 (04) */
+   0x2c0000001c01dc04ULL, /* mov b32 $r7 $pm3 (28) */
+   0x0800000000105c42ULL, /* add b32 $r1 $r1 0 $c (04) */
+   0x2c00000140009c04ULL, /* mov b32 $r2 $clock (28) */
+   0x2042804200420047ULL, /* sched */
+   0x94000000000107c5ULL, /* $p1 st b128 wt g[$r0d] $r4q (04) */
+   0x2c00000020011c04ULL, /* mov b32 $r4 $pm4 (20) */
+   0x2c00000024015c04ULL, /* mov b32 $r5 $pm5 (04) */
+   0x2c00000028019c04ULL, /* mov b32 $r6 $pm6 (20) */
+   0x2c0000002c01dc04ULL, /* mov b32 $r7 $pm7 (04) */
+   0x2c0000014400dc04ULL, /* mov b32 $r3 $clockhi (28) */
+   0x94000000400107c5ULL, /* $p1 st b128 wt g[$r0d+16] $r4q (04) */
+   0x200002e042804207ULL, /* sched */
+   0x2800400020011de4ULL, /* mov b32 $r4 c0[8] (20) */
+   0x2c0000000c015c04ULL, /* mov b32 $r5 $physid (04) */
+   0x94000000800087a5ULL, /* $p1 st b64 wt g[$r0d+32] $r2d (28) */
+   0x94000000a00107a5ULL, /* $p1 st b64 wt g[$r0d+40] $r4d (04) */
+   0x8000000000001de7ULL  /* exit (2e) */
+};
+
+/* NOTE: intentionally using the same names as NV */
+static const char *nve4_pm_query_names[] =
+{
+   /* MP counters */
+   "prof_trigger_00",
+   "prof_trigger_01",
+   "prof_trigger_02",
+   "prof_trigger_03",
+   "prof_trigger_04",
+   "prof_trigger_05",
+   "prof_trigger_06",
+   "prof_trigger_07",
+   "warps_launched",
+   "threads_launched",
+   "sm_cta_launched",
+   "inst_issued1",
+   "inst_issued2",
+   "inst_executed",
+   "local_load",
+   "local_store",
+   "shared_load",
+   "shared_store",
+   "l1_local_load_hit",
+   "l1_local_load_miss",
+   "l1_local_store_hit",
+   "l1_local_store_miss",
+   "gld_request",
+   "gst_request",
+   "l1_global_load_hit",
+   "l1_global_load_miss",
+   "uncached_global_load_transaction",
+   "global_store_transaction",
+   "branch",
+   "divergent_branch",
+   "active_warps",
+   "active_cycles"
+};
+
+/* For simplicity, we will allocate as many group slots as we allocate counter
+ * slots. This means that a single counter which wants to source from 2 groups
+ * will have to be declared as using 2 counter slots. This shouldn't really be
+ * a problem because such queries don't make much sense ... (unless someone is
+ * really creative).
+ */
+struct nve4_mp_counter_cfg
+{
+   uint32_t func    : 16; /* mask or 4-bit logic op (depending on mode) */
+   uint32_t mode    : 4;  /* LOGOP,B6,LOGOP_B6(_PULSE) */
+   uint32_t pad     : 3;
+   uint32_t sig_dom : 1;  /* if 0, MP_PM_A, if 1, MP_PM_B */
+   uint32_t sig_sel : 8;  /* signal group */
+   uint32_t src_sel : 32; /* signal selection for up to 5 sources */
+};
+
+struct nve4_mp_pm_query_cfg
+{
+   struct nve4_mp_counter_cfg ctr[4];
+   uint8_t num_counters;
+   uint8_t op; /* PIPE_LOGICOP_CLEAR(for ADD),OR,AND */
+};
+
+#define _Q1A(n, f, m, g, s) [NVE4_PM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, s }, {}, {}, {} }, 1, PIPE_LOGICOP_CLEAR }
+#define _Q1B(n, f, m, g, s) [NVE4_PM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, s }, {}, {}, {} }, 1, PIPE_LOGICOP_CLEAR }
+
+static const struct nve4_mp_pm_query_cfg nve4_mp_pm_queries[] =
+{
+   _Q1A(PROF_TRIGGER_0, 0x0001, B6, USER, 0x00000000),
+   _Q1A(PROF_TRIGGER_1, 0x0001, B6, USER, 0x00000004),
+   _Q1A(PROF_TRIGGER_2, 0x0001, B6, USER, 0x00000008),
+   _Q1A(PROF_TRIGGER_3, 0x0001, B6, USER, 0x0000000c),
+   _Q1A(PROF_TRIGGER_4, 0x0001, B6, USER, 0x00000010),
+   _Q1A(PROF_TRIGGER_5, 0x0001, B6, USER, 0x00000014),
+   _Q1A(PROF_TRIGGER_6, 0x0001, B6, USER, 0x00000018),
+   _Q1A(PROF_TRIGGER_7, 0x0001, B6, USER, 0x0000001c),
+   _Q1A(LAUNCHED_WARPS,    0x0001, B6, LAUNCH, 0x00000004),
+   _Q1A(LAUNCHED_THREADS,  0x003f, B6, LAUNCH, 0x398a4188),
+   _Q1B(LAUNCHED_CTA,      0x0001, B6, WARP, 0x0000001c),
+   _Q1A(INST_ISSUED1,  0x0001, B6, ISSUE, 0x00000004),
+   _Q1A(INST_ISSUED2,  0x0001, B6, ISSUE, 0x00000008),
+   _Q1A(INST_EXECUTED, 0x0003, B6, EXEC,  0x00000398),
+   _Q1A(LD_SHARED,   0x0001, B6, LDST, 0x00000000),
+   _Q1A(ST_SHARED,   0x0001, B6, LDST, 0x00000004),
+   _Q1A(LD_LOCAL,    0x0001, B6, LDST, 0x00000008),
+   _Q1A(ST_LOCAL,    0x0001, B6, LDST, 0x0000000c),
+   _Q1A(GLD_REQUEST, 0x0001, B6, LDST, 0x00000010),
+   _Q1A(GST_REQUEST, 0x0001, B6, LDST, 0x00000014),
+   _Q1B(L1_LOCAL_LOAD_HIT,   0x0001, B6, L1, 0x00000000),
+   _Q1B(L1_LOCAL_LOAD_MISS,  0x0001, B6, L1, 0x00000004),
+   _Q1B(L1_LOCAL_STORE_HIT,  0x0001, B6, L1, 0x00000008),
+   _Q1B(L1_LOCAL_STORE_MISS, 0x0001, B6, L1, 0x0000000c),
+   _Q1B(L1_GLOBAL_LOAD_HIT,  0x0001, B6, L1, 0x00000010),
+   _Q1B(L1_GLOBAL_LOAD_MISS, 0x0001, B6, L1, 0x00000014),
+   _Q1B(GLD_TRANSACTIONS_UNCACHED, 0x0001, B6, MEM, 0x00000000),
+   _Q1B(GST_TRANSACTIONS,          0x0001, B6, MEM, 0x00000004),
+   _Q1A(BRANCH,           0x0001, B6, BRANCH, 0x0000000c),
+   _Q1A(BRANCH_DIVERGENT, 0x0001, B6, BRANCH, 0x00000010),
+   _Q1B(ACTIVE_WARPS,  0x003f, B6, WARP, 0x398a4188),
+   _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000004)
+};
+
+#undef _Q1A
+#undef _Q1B
+
+static void
+nve4_mp_pm_query_begin(struct nvc0_context *nvc0, struct nvc0_query *q)
+{
+   struct nvc0_screen *screen = nvc0->screen;
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   const struct nve4_mp_pm_query_cfg *cfg;
+   unsigned i, c;
+   unsigned num_ab[2] = { 0, 0 };
+   /* Claims free MP counter slots for q, then configures and resets them. */
+   cfg = &nve4_mp_pm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC];
+
+   /* check if we have enough free counter slots (4 per signal domain) */
+   for (i = 0; i < cfg->num_counters; ++i)
+      num_ab[cfg->ctr[i].sig_dom]++;
+
+   if (screen->pm.num_mp_pm_active[0] + num_ab[0] > 4 ||
+       screen->pm.num_mp_pm_active[1] + num_ab[1] > 4) {
+      NOUVEAU_ERR("Not enough free MP counter slots !\n");
+      return; /* nothing claimed; q->ctr[] is left untouched */
+   }
+
+   assert(cfg->num_counters <= 4);
+   PUSH_SPACE(push, 4 * 8 + 6);
+
+   if (!screen->pm.mp_counters_enabled) {
+      screen->pm.mp_counters_enabled = TRUE;
+      BEGIN_NVC0(push, SUBC_SW(0x06ac), 1);
+      PUSH_DATA (push, 0x1fcb);
+   }
+
+   /* clear each MP's sequence word (dword 10 of its 12-dword record) */
+   for (i = 0; i < screen->mp_count; ++i)
+      q->data[i * 12 + 10] = 0; /* stride must match the result reader */
+
+   for (i = 0; i < cfg->num_counters; ++i) {
+      const unsigned d = cfg->ctr[i].sig_dom;
+
+      if (!screen->pm.num_mp_pm_active[d]) {
+         uint32_t m = (1 << 22) | (1 << (7 + (8 * !d)));
+         if (screen->pm.num_mp_pm_active[!d])
+            m |= 1 << (7 + (8 * d));
+         BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
+         PUSH_DATA (push, m);
+      }
+      screen->pm.num_mp_pm_active[d]++;
+
+      for (c = d * 4; c < (d * 4 + 4); ++c) { /* first free slot of domain d */
+         if (!screen->pm.mp_counter[c]) {
+            q->ctr[i] = c;
+            screen->pm.mp_counter[c] = (struct pipe_query *)q;
+            break;
+         }
+      }
+      assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */
+
+      /* configure and reset the counter(s) */
+      if (d == 0)
+         BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_A_SIGSEL(c & 3)), 1);
+      else
+         BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_B_SIGSEL(c & 3)), 1);
+      PUSH_DATA (push, cfg->ctr[i].sig_sel);
+      BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SRCSEL(c)), 1);
+      PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3));
+      BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 1);
+      PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
+      BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SET(c)), 1);
+      PUSH_DATA (push, 0);
+   }
+}
+
+static void
+nve4_mp_pm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q)
+{
+   struct nvc0_screen *screen = nvc0->screen;
+   struct pipe_context *pipe = &nvc0->base.pipe;
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   uint32_t mask;
+   uint32_t input[3];
+   const uint block[3] = { 32, 1, 1 };
+   const uint grid[3] = { screen->mp_count, 1, 1 };
+   unsigned c;
+   const struct nve4_mp_pm_query_cfg *cfg;
+   /* Stops counting, frees q's slots, launches read-out, re-arms the rest. */
+   cfg = &nve4_mp_pm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC];
+   /* build read-out program on first use; NOTE(review): no NULL check */
+   if (unlikely(!screen->pm.prog)) {
+      struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program);
+      prog->type = PIPE_SHADER_COMPUTE;
+      prog->translated = TRUE;
+      prog->num_gprs = 8;
+      prog->code = (uint32_t *)nve4_read_mp_pm_counters_code;
+      prog->code_size = sizeof(nve4_read_mp_pm_counters_code);
+      prog->parm_size = 12; /* sizeof(input) below: 3 dwords */
+      screen->pm.prog = prog;
+   }
+
+   /* disable all counting */
+   PUSH_SPACE(push, 8);
+   for (c = 0; c < 8; ++c)
+      if (screen->pm.mp_counter[c])
+         IMMED_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 0);
+   /* release counters for this query */
+   for (c = 0; c < 8; ++c) {
+      if (nvc0_query(screen->pm.mp_counter[c]) == q) {
+         screen->pm.num_mp_pm_active[c / 4]--; /* c / 4 is the signal domain */
+         screen->pm.mp_counter[c] = NULL;
+      }
+   }
+
+   BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR,
+                q->bo);
+
+   pipe->bind_compute_state(pipe, screen->pm.prog);
+   input[0] = (q->bo->offset + q->base); /* result address, low 32 bits */
+   input[1] = (q->bo->offset + q->base) >> 32; /* result address, high bits */
+   input[2] = q->sequence; /* compared against by the result reader */
+   pipe->launch_grid(pipe, block, grid, 0, input);
+
+   nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY);
+
+   /* re-activate other counters */
+   PUSH_SPACE(push, 16);
+   mask = 0; /* bit n set once slot n has been re-armed */
+   for (c = 0; c < 8; ++c) {
+      unsigned i;
+      q = nvc0_query(screen->pm.mp_counter[c]); /* q is now slot c's owner */
+      if (!q)
+         continue;
+      cfg = &nve4_mp_pm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC];
+      for (i = 0; i < cfg->num_counters; ++i) {
+         if (mask & (1 << q->ctr[i])) /* this owner was already handled */
+            break;
+         mask |= 1 << q->ctr[i];
+         BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(q->ctr[i])), 1);
+         PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
+      }
+   }
+}
+
+static boolean
+nve4_mp_pm_query_result(struct nvc0_context *nvc0, struct nvc0_query *q,
+                        void *result, boolean wait)
+{
+   uint32_t count[4];
+   uint64_t value = 0;
+   unsigned p, c;
+   const struct nve4_mp_pm_query_cfg *cfg;
+   /* Folds q's counters over all MPs into *(uint64_t *)result. */
+   cfg = &nve4_mp_pm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC];
+
+   for (p = 0; p < nvc0->screen->mp_count_compute; ++p) {
+      uint64_t clock;
+      const unsigned b = p * 12; /* each MP's record is 12 dwords */
+
+      clock = *(uint64_t *)&q->data[b + 8]; /* dwords 8-9 of the record */
+      (void)clock; /* might be interesting one day */
+
+      if (q->data[b + 10] != q->sequence) { /* this MP's record not ready */
+         /* WARNING: This will spin forever if you loop with wait == FALSE and
+          * the push buffer hasn't been flushed !
+          */
+         if (!wait)
+            return FALSE;
+         if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->base.client))
+            return FALSE;
+      }
+
+      for (c = 0; c < cfg->num_counters; ++c)
+         count[c] = q->data[b + q->ctr[c]]; /* ctr[] holds the slot index */
+      for (; c < 4; ++c)
+         count[c] = 0; /* unused entries contribute nothing to ADD/OR */
+
+      switch (cfg->op) {
+      case PIPE_LOGICOP_AND: /* NOTE(review): value starts at 0 => stays 0 */
+         value &= count[0] & count[1] & count[2] & count[3];
+         break;
+      case PIPE_LOGICOP_OR:
+         value |= count[0] | count[1] | count[2] | count[3];
+         break;
+      case PIPE_LOGICOP_CLEAR: /* abused as ADD */
+      default:
+         value += count[0] + count[1] + count[2] + count[3];
+         break;
+      }
+   }
+   *(uint64_t *)result = value;
+   return TRUE; /* FALSE above means: not ready (and !wait) or wait failed */
+}
+
+int
+nvc0_screen_get_driver_query_info(struct pipe_screen *pscreen,
+                                  unsigned id,
+                                  struct pipe_driver_query_info *info)
+{
+   struct nvc0_screen *screen = nvc0_screen(pscreen);
+   /* info == NULL: return the query count; otherwise describe query id. */
+   if (screen->base.class_3d >= NVE4_3D_CLASS) {
+      unsigned count = 0; /* stays 0 when the drm_version test below fails */
+      if (screen->base.device->drm_version >= 0x01000101)
+         count = NVE4_PM_QUERY_COUNT;
+      if (!info)
+         return count;
+      if (id < count) {
+         info->name = nve4_pm_query_names[id];
+         info->query_type = NVE4_PM_QUERY(id);
+         info->max_value = ~0ULL;
+         info->uses_byte_units = FALSE;
+         return 1; /* *info was filled in for a valid id */
+      }
+   } else {
+      if (!info)
+         return 0; /* no driver queries below NVE4_3D_CLASS */
+   }
+   /* user asked for info about non-existing query */
+   info->name = "this_is_not_the_query_you_are_looking_for";
+   info->query_type = 0xdeadd01d;
+   info->max_value = 0;
+   info->uses_byte_units = FALSE;
+   return 0; /* *info holds placeholder values only */
+}
+
 void
 nvc0_init_query_functions(struct nvc0_context *nvc0)
 {
diff --git a/src/gallium/drivers/nvc0/nvc0_screen.c b/src/gallium/drivers/nvc0/nvc0_screen.c
index bf353c4..5b9385a 100644
--- a/src/gallium/drivers/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nvc0/nvc0_screen.c
@@ -352,6 +352,10 @@ nvc0_screen_destroy(struct pipe_screen *pscreen)
 
    if (screen->blitter)
       nvc0_blitter_destroy(screen);
+   if (screen->pm.prog) {
+      screen->pm.prog->code = NULL; /* hardcoded, don't FREE */
+      nvc0_program_destroy(NULL, screen->pm.prog);
+   }
 
    nouveau_bo_ref(NULL, &screen->text);
    nouveau_bo_ref(NULL, &screen->uniform_bo);
@@ -581,6 +585,7 @@ nvc0_screen_create(struct nouveau_device *dev)
    pscreen->get_param = nvc0_screen_get_param;
    pscreen->get_shader_param = nvc0_screen_get_shader_param;
    pscreen->get_paramf = nvc0_screen_get_paramf;
+   pscreen->get_driver_query_info = nvc0_screen_get_driver_query_info;
 
    nvc0_screen_init_resource_functions(pscreen);
 
@@ -785,6 +790,7 @@ nvc0_screen_create(struct nouveau_device *dev)
          value = (16 << 8) | 4;
    }
    screen->mp_count = value >> 8;
+   screen->mp_count_compute = screen->mp_count;
 
    nvc0_screen_resize_tls_area(screen, 128 * 16, 0, 0x200);
 
diff --git a/src/gallium/drivers/nvc0/nvc0_screen.h b/src/gallium/drivers/nvc0/nvc0_screen.h
index 13dc83e..b7cfd05 100644
--- a/src/gallium/drivers/nvc0/nvc0_screen.h
+++ b/src/gallium/drivers/nvc0/nvc0_screen.h
@@ -39,6 +39,7 @@ struct nvc0_screen {
    struct nouveau_bo *poly_cache;
 
    uint16_t mp_count;
+   uint16_t mp_count_compute; /* magic reg can make compute use fewer MPs */
 
    struct nouveau_heap *text_heap;
    struct nouveau_heap *lib_code; /* allocated from text_heap */
@@ -62,6 +63,13 @@ struct nvc0_screen {
       uint32_t *map;
    } fence;
 
+   struct {
+      struct nvc0_program *prog; /* compute state object to read MP counters */
+      struct pipe_query *mp_counter[8]; /* counter to query allocation */
+      uint8_t num_mp_pm_active[2];
+      boolean mp_counters_enabled;
+   } pm;
+
    struct nouveau_mman *mm_VRAM_fe0;
 
    struct nouveau_object *eng3d; /* sqrt(1/2)|kepler> + sqrt(1/2)|fermi> */
@@ -76,6 +84,69 @@ nvc0_screen(struct pipe_screen *screen)
    return (struct nvc0_screen *)screen;
 }
 
+
+/* Performance counter queries:
+ */
+#define NVE4_PM_QUERY_COUNT  32
+#define NVE4_PM_QUERY(i)    (PIPE_QUERY_DRIVER_SPECIFIC + (i))
+#define NVE4_PM_QUERY_MAX    NVE4_PM_QUERY(NVE4_PM_QUERY_COUNT - 1)
+/* MP (NOTE: these are also used to index a table, so put them first) */
+#define NVE4_PM_QUERY_PROF_TRIGGER_0            0
+#define NVE4_PM_QUERY_PROF_TRIGGER_1            1
+#define NVE4_PM_QUERY_PROF_TRIGGER_2            2
+#define NVE4_PM_QUERY_PROF_TRIGGER_3            3
+#define NVE4_PM_QUERY_PROF_TRIGGER_4            4
+#define NVE4_PM_QUERY_PROF_TRIGGER_5            5
+#define NVE4_PM_QUERY_PROF_TRIGGER_6            6
+#define NVE4_PM_QUERY_PROF_TRIGGER_7            7
+#define NVE4_PM_QUERY_LAUNCHED_WARPS            8
+#define NVE4_PM_QUERY_LAUNCHED_THREADS          9
+#define NVE4_PM_QUERY_LAUNCHED_CTA              10
+#define NVE4_PM_QUERY_INST_ISSUED1              11
+#define NVE4_PM_QUERY_INST_ISSUED2              12
+#define NVE4_PM_QUERY_INST_EXECUTED             13
+#define NVE4_PM_QUERY_LD_LOCAL                  14
+#define NVE4_PM_QUERY_ST_LOCAL                  15
+#define NVE4_PM_QUERY_LD_SHARED                 16
+#define NVE4_PM_QUERY_ST_SHARED                 17
+#define NVE4_PM_QUERY_L1_LOCAL_LOAD_HIT         18
+#define NVE4_PM_QUERY_L1_LOCAL_LOAD_MISS        19
+#define NVE4_PM_QUERY_L1_LOCAL_STORE_HIT        20
+#define NVE4_PM_QUERY_L1_LOCAL_STORE_MISS       21
+#define NVE4_PM_QUERY_GLD_REQUEST               22
+#define NVE4_PM_QUERY_GST_REQUEST               23
+#define NVE4_PM_QUERY_L1_GLOBAL_LOAD_HIT        24
+#define NVE4_PM_QUERY_L1_GLOBAL_LOAD_MISS       25
+#define NVE4_PM_QUERY_GLD_TRANSACTIONS_UNCACHED 26
+#define NVE4_PM_QUERY_GST_TRANSACTIONS          27
+#define NVE4_PM_QUERY_BRANCH                    28
+#define NVE4_PM_QUERY_BRANCH_DIVERGENT          29
+#define NVE4_PM_QUERY_ACTIVE_WARPS              30
+#define NVE4_PM_QUERY_ACTIVE_CYCLES             31
+/* Engines (PCOUNTER) */
+/*
+#define NVE4_PM_QUERY_GR_IDLE                   50
+#define NVE4_PM_QUERY_BSP_IDLE                  51
+#define NVE4_PM_QUERY_VP_IDLE                   52
+#define NVE4_PM_QUERY_PPP_IDLE                  53
+#define NVE4_PM_QUERY_CE0_IDLE                  54
+#define NVE4_PM_QUERY_CE1_IDLE                  55
+#define NVE4_PM_QUERY_CE2_IDLE                  56
+*/
+/* L2 queries (PCOUNTER) */
+/*
+#define NVE4_PM_QUERY_L2_SUBP_WRITE_L1_SECTOR_QUERIES 57
+...
+*/
+/* TEX queries (PCOUNTER) */
+/*
+#define NVE4_PM_QUERY_TEX0_CACHE_SECTOR_QUERIES 58
+...
+*/
+
+int nvc0_screen_get_driver_query_info(struct pipe_screen *, unsigned,
+                                      struct pipe_driver_query_info *);
+
 boolean nvc0_blitter_create(struct nvc0_screen *);
 void nvc0_blitter_destroy(struct nvc0_screen *);
 
diff --git a/src/gallium/drivers/nvc0/nvc0_winsys.h b/src/gallium/drivers/nvc0/nvc0_winsys.h
index c13ebd5..25183a4 100644
--- a/src/gallium/drivers/nvc0/nvc0_winsys.h
+++ b/src/gallium/drivers/nvc0/nvc0_winsys.h
@@ -65,6 +65,8 @@ PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags)
 #define SUBC_COPY(m) 4, (m)
 #define NVE4_COPY(m) SUBC_COPY(NVE4_COPY_##n)
 
+#define SUBC_SW(m) 7, (m)
+
 static INLINE uint32_t
 NVC0_FIFO_PKHDR_SQ(int subc, int mthd, unsigned size)
 {
diff --git a/src/gallium/drivers/nvc0/nve4_compute.c b/src/gallium/drivers/nvc0/nve4_compute.c
index 943ae78..89da7d5 100644
--- a/src/gallium/drivers/nvc0/nve4_compute.c
+++ b/src/gallium/drivers/nvc0/nve4_compute.c
@@ -27,7 +27,9 @@
 
 #include "nv50/codegen/nv50_ir_driver.h"
 
+#ifdef DEBUG
 static void nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *);
+#endif
 
 
 int
@@ -477,7 +479,10 @@ nve4_launch_grid(struct pipe_context *pipe,
       goto out;
 
    nve4_compute_setup_launch_desc(nvc0, desc, label, block_layout, grid_layout);
-   nve4_compute_dump_launch_desc(desc);
+#ifdef DEBUG
+   if (debug_get_num_option("NV50_PROG_DEBUG", 0))
+      nve4_compute_dump_launch_desc(desc);
+#endif
 
    nve4_compute_upload_input(nvc0, input, block_layout, grid_layout);
 
@@ -589,6 +594,7 @@ static const char *nve4_cache_split_name(unsigned value)
    }
 }
 
+#ifdef DEBUG
 static void
 nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *desc)
 {
@@ -635,7 +641,9 @@ nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *desc)
                    i, address, size, valid ? "" : "  (invalid)");
    }
 }
+#endif
 
+#ifdef NOUVEAU_NVE4_MP_TRAP_HANDLER
 static void
 nve4_compute_trap_info(struct nvc0_context *nvc0)
 {
@@ -667,3 +675,4 @@ nve4_compute_trap_info(struct nvc0_context *nvc0)
    }
    info->lock = 0;
 }
+#endif
diff --git a/src/gallium/drivers/nvc0/nve4_compute.xml.h b/src/gallium/drivers/nvc0/nve4_compute.xml.h
index 2f110f5..9a77466 100644
--- a/src/gallium/drivers/nvc0/nve4_compute.xml.h
+++ b/src/gallium/drivers/nvc0/nve4_compute.xml.h
@@ -199,6 +199,76 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #define NVE4_COMPUTE_UNK260c					0x0000260c
 
+#define NVE4_COMPUTE_MP_PM_SET(i0)			       (0x0000335c + 0x4*(i0))
+#define NVE4_COMPUTE_MP_PM_SET__ESIZE				0x00000004
+#define NVE4_COMPUTE_MP_PM_SET__LEN				0x00000008
+
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL(i0)			       (0x0000337c + 0x4*(i0))
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL__ESIZE			0x00000004
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL__LEN			0x00000004
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL_NONE			0x00000000
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL_USER			0x00000001
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL_LAUNCH			0x00000003
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL_EXEC			0x00000004
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL_ISSUE			0x00000005
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL_LDST			0x0000001b
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL_BRANCH			0x0000001c
+
+#define NVE4_COMPUTE_MP_PM_B_SIGSEL(i0)			       (0x0000338c + 0x4*(i0))
+#define NVE4_COMPUTE_MP_PM_B_SIGSEL__ESIZE			0x00000004
+#define NVE4_COMPUTE_MP_PM_B_SIGSEL__LEN			0x00000004
+#define NVE4_COMPUTE_MP_PM_B_SIGSEL_NONE			0x00000000
+#define NVE4_COMPUTE_MP_PM_B_SIGSEL_WARP			0x00000002
+#define NVE4_COMPUTE_MP_PM_B_SIGSEL_L1				0x00000010
+#define NVE4_COMPUTE_MP_PM_B_SIGSEL_MEM				0x00000011
+
+#define NVE4_COMPUTE_MP_PM_SRCSEL(i0)			       (0x0000339c + 0x4*(i0))
+#define NVE4_COMPUTE_MP_PM_SRCSEL__ESIZE			0x00000004
+#define NVE4_COMPUTE_MP_PM_SRCSEL__LEN				0x00000008
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP0__MASK			0x00000003
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP0__SHIFT			0
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG0__MASK			0x0000001c
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG0__SHIFT			2
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP1__MASK			0x00000060
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP1__SHIFT			5
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG1__MASK			0x00000380
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG1__SHIFT			7
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP2__MASK			0x00000c00
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP2__SHIFT			10
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG2__MASK			0x00007000
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG2__SHIFT			12
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP3__MASK			0x00018000
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP3__SHIFT			15
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG3__MASK			0x000e0000
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG3__SHIFT			17
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP4__MASK			0x00300000
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP4__SHIFT			20
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG4__MASK			0x01c00000
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG4__SHIFT			22
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP5__MASK			0x06000000
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP5__SHIFT			25
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG5__MASK			0x38000000
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG5__SHIFT			27
+
+#define NVE4_COMPUTE_MP_PM_FUNC(i0)			       (0x000033bc + 0x4*(i0))
+#define NVE4_COMPUTE_MP_PM_FUNC__ESIZE				0x00000004
+#define NVE4_COMPUTE_MP_PM_FUNC__LEN				0x00000008
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE__MASK			0x0000000f
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE__SHIFT			0
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE_LOGOP			0x00000000
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE_LOGOP_PULSE		0x00000001
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE_B6				0x00000002
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE_UNK3			0x00000003
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE_LOGOP_B6			0x00000004
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE_LOGOP_B6_PULSE		0x00000005
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE_UNK6			0x00000006
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE_UNK7			0x00000007
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE_UNK8			0x00000008
+#define NVE4_COMPUTE_MP_PM_FUNC_FUNC__MASK			0x000ffff0
+#define NVE4_COMPUTE_MP_PM_FUNC_FUNC__SHIFT			4
+
+#define NVE4_COMPUTE_MP_PM_UNK33DC				0x000033dc
+
 #define NVE4_COMPUTE_LAUNCH_DESC__SIZE				0x00000100
 #define NVE4_COMPUTE_LAUNCH_DESC_6				0x00000018
 #define NVE4_COMPUTE_LAUNCH_DESC_6_NOTIFY__MASK			0x00000c00




More information about the mesa-commit mailing list