Mesa (master): nvc0: support MP performance counters on Maxwell

Thu Nov 10 21:14:49 UTC 2016

Module: Mesa
Branch: master
Commit: 561f2208bd6a952554e3bb043f6ff483a824a41c
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=561f2208bd6a952554e3bb043f6ff483a824a41c

Author: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Date:   Sat Nov  5 17:23:33 2016 +0100

nvc0: support MP performance counters on Maxwell

This adds some performance counters/metrics for SM50/SM52.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Tested-by: Pierre Moreau <pierre.morrow at free.fr>

---

 .../drivers/nouveau/nvc0/nvc0_query_hw_metric.c    |  26 +-
 .../drivers/nouveau/nvc0/nvc0_query_hw_sm.c        | 685 ++++++++++++++++++++-
 .../drivers/nouveau/nvc0/nvc0_query_hw_sm.h        |  13 +-
 3 files changed, 721 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c
index 0e2d89f..089af61 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c
@@ -376,6 +376,22 @@ static const struct nvc0_hw_metric_query_cfg *sm35_hw_metric_queries[] =
    &sm35_warp_nonpred_execution_efficiency,
 };
 
+/* ==== Compute capability 5.0 (GM107/GM108) ==== */
+static const struct nvc0_hw_metric_query_cfg *sm50_hw_metric_queries[] =
+{
+   &sm20_achieved_occupancy,
+   &sm20_branch_efficiency,
+   &sm30_inst_issued,
+   &sm20_inst_per_wrap,
+   &sm30_inst_replay_overhead,
+   &sm20_ipc,
+   &sm30_issued_ipc,
+   &sm30_issue_slots,
+   &sm30_issue_slot_utilization,
+   &sm30_warp_execution_efficiency,
+   &sm35_warp_nonpred_execution_efficiency,
+};
+
 #undef _SM
 
 static inline const struct nvc0_hw_metric_query_cfg **
@@ -384,6 +400,9 @@ nvc0_hw_metric_get_queries(struct nvc0_screen *screen)
    struct nouveau_device *dev = screen->base.device;
 
    switch (screen->base.class_3d) {
+   case GM200_3D_CLASS:
+   case GM107_3D_CLASS:
+      return sm50_hw_metric_queries;
    case NVF0_3D_CLASS:
       return sm35_hw_metric_queries;
    case NVE4_3D_CLASS:
@@ -403,6 +422,9 @@ nvc0_hw_metric_get_num_queries(struct nvc0_screen *screen)
    struct nouveau_device *dev = screen->base.device;
 
    switch (screen->base.class_3d) {
+   case GM200_3D_CLASS:
+   case GM107_3D_CLASS:
+      return ARRAY_SIZE(sm50_hw_metric_queries);
    case NVF0_3D_CLASS:
       return ARRAY_SIZE(sm35_hw_metric_queries);
    case NVE4_3D_CLASS:
@@ -660,6 +682,8 @@ nvc0_hw_metric_get_query_result(struct nvc0_context *nvc0,
    }
 
    switch (screen->base.class_3d) {
+   case GM200_3D_CLASS:
+   case GM107_3D_CLASS:
    case NVF0_3D_CLASS:
       value = sm35_hw_metric_calc_result(hq, res64);
       break;
@@ -734,7 +758,7 @@ nvc0_hw_metric_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
 
    if (id < count) {
       if (screen->compute) {
-         if (screen->base.class_3d <= NVF0_3D_CLASS) {
+         if (screen->base.class_3d <= GM200_3D_CLASS) {
             const struct nvc0_hw_metric_query_cfg **queries =
                nvc0_hw_metric_get_queries(screen);
             const struct nvc0_hw_metric_cfg *cfg =
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
index 2136368..dae1721 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
@@ -37,6 +37,12 @@ static const struct {
    const char *name;
    const char *desc;
 } nvc0_hw_sm_queries[] = {
+   _Q(ACTIVE_CTAS,
+      "active_ctas",
+      "Accumulated number of active blocks per cycle. For every cycle it "
+      "increments by the number of active blocks in the cycle which can be in "
+      "the range 0 to 32."),
+
    _Q(ACTIVE_CYCLES,
       "active_cycles",
       "Number of cycles a multiprocessor has at least one active warp"),
@@ -81,6 +87,20 @@ static const struct {
       "128 bytes. For each extra cache line access the counter is incremented "
       "by 1"),
 
+   _Q(GLOBAL_ATOM_CAS,
+      "global_atom_cas",
+      "Number of ATOM.CAS instructions executed per warp."),
+
+   _Q(GLOBAL_LD,
+      "global_load",
+      "Number of executed load instructions where state space is specified as "
+      "global, increments per warp on a multiprocessor."),
+
+   _Q(GLOBAL_ST,
+      "global_store",
+      "Number of executed store instructions where state space is specified as "
+      "global, increments per warp on a multiprocessor."),
+
    _Q(GST_TRANSACTIONS,
       "global_store_transaction",
       "Number of global store transactions. Increments by 1 per transaction. "
@@ -114,6 +134,11 @@ static const struct {
       "inst_issued",
       "Number of instructions issued including replays"),
 
+   _Q(INST_ISSUED0,
+      "inst_issued0",
+      "Number of cycles that did not issue any instruction, increments per "
+      "warp."),
+
    _Q(INST_ISSUED1,
       "inst_issued1",
       "Number of single instruction issued per cycle"),
@@ -260,11 +285,24 @@ static const struct {
       "User profiled generic trigger that can be inserted in any place of the "
       "code to collect the related information. Increments per warp."),
 
+   _Q(SHARED_ATOM,
+      "shared_atom",
+      "Number of ATOMS instructions executed per warp."),
+
+   _Q(SHARED_ATOM_CAS,
+      "shared_atom_cas",
+      "Number of ATOMS.CAS instructions executed per warp."),
+
    _Q(SHARED_LD,
       "shared_load",
       "Number of executed load instructions where state space is specified as "
       "shared, increments per warp on a multiprocessor"),
 
+   _Q(SHARED_LD_BANK_CONFLICT,
+      "shared_load_bank_conflict",
+      "Number of shared load bank conflict generated when the addresses for "
+      "two or more shared memory load requests fall in the same memory bank."),
+
    _Q(SHARED_LD_REPLAY,
       "shared_load_replay",
       "Replays caused due to shared load bank conflict (when the addresses for "
@@ -273,11 +311,23 @@ static const struct {
       "threads in the warp executing that instruction exceed the number of words "
       "that can be loaded in one cycle (256 bytes)"),
 
+   _Q(SHARED_LD_TRANSACTIONS,
+      "shared_ld_transactions",
+      "Number of transactions for shared load accesses. Maximum transaction "
+      "size in maxwell is 128 bytes, any warp accessing more that 128 bytes "
+      "will cause multiple transactions for a shared load instruction. This "
+      "also includes extra transactions caused by shared bank conflicts."),
+
    _Q(SHARED_ST,
       "shared_store",
       "Number of executed store instructions where state space is specified as "
       "shared, increments per warp on a multiprocessor"),
 
+   _Q(SHARED_ST_BANK_CONFLICT,
+      "shared_store_bank_conflict",
+      "Number of shared store bank conflict generated when the addresses for "
+      "two or more shared memory store requests fall in the same memory bank."),
+
    _Q(SHARED_ST_REPLAY,
       "shared_store_replay",
       "Replays caused due to shared store bank conflict (when the addresses for "
@@ -286,6 +336,13 @@ static const struct {
       "threads in the warp executing that instruction exceed the number of words "
       "that can be stored in one cycle"),
 
+   _Q(SHARED_ST_TRANSACTIONS,
+      "shared_st_transactions",
+      "Number of transactions for shared store accesses. Maximum transaction "
+      "size in maxwell is 128 bytes, any warp accessing more that 128 bytes "
+      "will cause multiple transactions for a shared store instruction. This "
+      "also includes extra transactions caused by shared bank conflicts."),
+
    _Q(SM_CTA_LAUNCHED,
       "sm_cta_launched",
       "Number of thread blocks launched on a multiprocessor"),
@@ -480,6 +537,54 @@ static const uint64_t nvf0_read_hw_sm_counters_code[] =
    0x18000000001c003cULL,
 };
 
+static const uint64_t gm107_read_hw_sm_counters_code[] =
+{
+   0x001f8000fc0007e0ULL,  /* sched (st 0x0) (st 0x0) (st 0x0)       */
+   0xf0c8000002170008ULL,  /* mov $r8 $tidx                          */
+   0xf0c800000037000cULL,  /* mov $r12 $virtid                       */
+   0xf0c8000000470000ULL,  /* mov $r0 $pm0                           */
+   0x001f8000fc0007e0ULL,  /* sched (st 0x0) (st 0x0) (st 0x0)       */
+   0xf0c8000000570001ULL,  /* mov $r1 $pm1                           */
+   0xf0c8000000670002ULL,  /* mov $r2 $pm2                           */
+   0xf0c8000000770003ULL,  /* mov $r3 $pm3                           */
+   0x001f8000fc0007e0ULL,  /* sched (st 0x0) (st 0x0) (st 0x0)       */
+   0xf0c8000000870004ULL,  /* mov $r4 $pm4                           */
+   0xf0c8000000970005ULL,  /* mov $r5 $pm5                           */
+   0xf0c8000000a70006ULL,  /* mov $r6 $pm6                           */
+   0x001f8000fc0007e0ULL,  /* sched (st 0x0) (st 0x0) (st 0x0)       */
+   0xf0c8000000b70007ULL,  /* mov $r7 $pm7                           */
+   0x5b6403800087ff07ULL,  /* isetp eq u32 and $p0 0x1 0x0 $r8 0x1   */
+   0x4c98079c1887000aULL,  /* mov $r10 c7[0x620] 0xf                 */
+   0x001f8000fc0007e0ULL,  /* sched (st 0x0) (st 0x0) (st 0x0)       */
+   0x3800000091470c08ULL,  /* bfe u32 $r8 $r12 0x914                 */
+   0x4c98079c1897000bULL,  /* mov $r11 c7[0x624] 0xf                 */
+   0x3800000020870c09ULL,  /* bfe u32 $r9 $r12 0x208                 */
+   0x001f8000fc0007e0ULL,  /* sched (st 0x0) (st 0x0) (st 0x0)       */
+   0xe30000000008000fULL,  /* not $p0 exit                           */
+   0x5b6403800097ff0fULL,  /* isetp eq u32 and $p1 0x1 0x0 $r9 0x1   */
+   0x3838000006070808ULL,  /* imul u32 u32 $r8 $r8 0x60              */
+   0x001f8000fc0007e0ULL,  /* sched (st 0x0) (st 0x0) (st 0x0)       */
+   0x383800000107090cULL,  /* imul u32 u32 $r12 $r9 0x10             */
+   0x383800000047090dULL,  /* imul u32 u32 $r13 $r9 0x4              */
+   0x5c10000000d70809ULL,  /* iadd $r9 $r8 $r13                      */
+   0x001f8000fc0007e0ULL,  /* sched (st 0x0) (st 0x0) (st 0x0)       */
+   0x5c10000000c70808ULL,  /* iadd $r8 $r8 $r12                      */
+   0x5c98078000a7000cULL,  /* mov $r12 $r10 0xf                      */
+   0x5c10800000870a0aULL,  /* iadd cc $r10 $r10 $r8                  */
+   0x001f8000fc0007e0ULL,  /* sched (st 0x0) (st 0x0) (st 0x0)       */
+   0x5c98078000b7000dULL,  /* mov $r13 $r11 0xf                      */
+   0x5c1008000ff70b0bULL,  /* iadd x $r11 $r11 0x0                   */
+   0x5c10800000970c0cULL,  /* iadd cc $r12 $r12 $r9                  */
+   0x001f8000fc0007e0ULL,  /* sched (st 0x0) (st 0x0) (st 0x0)       */
+   0x5c1008000ff70d0dULL,  /* iadd x $r13 $r13 0x0                   */
+   0xbfd0000000070a00ULL,  /* st e wt b128 g[$r10] $r0 0x1           */
+   0x4c98079c18a70000ULL,  /* mov $r0 c7[0x628] 0xf                  */
+   0x001f8000fc0007e0ULL,  /* sched (st 0x0) (st 0x0) (st 0x0)       */
+   0xbfd0000004010c04ULL,  /* $p1 st e wt b128 g[$r12+0x40] $r4 0x1  */
+   0xbf90000005070c00ULL,  /* st e wt b32 g[$r12+0x50] $r0 0x1       */
+   0xe30000000007000fULL,  /* exit                                   */
+};
+
 /* For simplicity, we will allocate as many group slots as we allocate counter
  * slots. This means that a single counter which wants to source from 2 groups
  * will have to be declared as using 2 counter slots. This shouldn't really be
@@ -1082,6 +1187,556 @@ static const struct nvc0_hw_sm_query_cfg *sm35_hw_sm_queries[] =
    &sm30_warps_launched,
 };
 
+/* ==== Compute capability 5.0 (GM107/GM108) ==== */
+static const struct nvc0_hw_sm_query_cfg
+sm50_active_ctas =
+{
+   .type         = NVC0_HW_SM_QUERY_ACTIVE_CTAS,
+   .ctr[0]       = _CB(0x003f, B6, 0x01, 0x29062080),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_active_cycles =
+{
+   .type         = NVC0_HW_SM_QUERY_ACTIVE_CYCLES,
+   .ctr[0]       = _CB(0x0001, B6, 0x00, 0x00000004),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_active_warps =
+{
+   .type         = NVC0_HW_SM_QUERY_ACTIVE_WARPS,
+   .ctr[0]       = _CB(0x003f, B6, 0x00, 0x398a4188),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_atom_count =
+{
+   .type         = NVC0_HW_SM_QUERY_ATOM_COUNT,
+   .ctr[0]       = _CA(0x0001, B6, 0x14, 0x00000004),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_branch =
+{
+   .type         = NVC0_HW_SM_QUERY_BRANCH,
+   .ctr[0]       = _CA(0x0001, B6, 0x1a, 0x00000010),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_divergent_branch =
+{
+   .type         = NVC0_HW_SM_QUERY_DIVERGENT_BRANCH,
+   .ctr[0]       = _CA(0x0001, B6, 0x1a, 0x00000004),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_global_atom_cas =
+{
+   .type         = NVC0_HW_SM_QUERY_GLOBAL_ATOM_CAS,
+   .ctr[0]       = _CA(0x0001, B6, 0x14, 0x00000000),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_global_ld =
+{
+   .type         = NVC0_HW_SM_QUERY_GLOBAL_LD,
+   .ctr[0]       = _CA(0x0001, B6, 0x14, 0x0000000c),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_global_st =
+{
+   .type         = NVC0_HW_SM_QUERY_GLOBAL_ST,
+   .ctr[0]       = _CA(0x0001, B6, 0x14, 0x00000010),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_gred_count =
+{
+   .type         = NVC0_HW_SM_QUERY_GRED_COUNT,
+   .ctr[0]       = _CA(0x0001, B6, 0x14, 0x00000008),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_inst_executed =
+{
+   .type         = NVC0_HW_SM_QUERY_INST_EXECUTED,
+   .ctr[0]       = _CA(0x0003, B6, 0x02, 0x00000398),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_inst_issued0 =
+{
+   .type         = NVC0_HW_SM_QUERY_INST_ISSUED0,
+   .ctr[0]       = _CA(0x0001, B6, 0x02, 0x0000000c),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_inst_issued1 =
+{
+   .type         = NVC0_HW_SM_QUERY_INST_ISSUED1,
+   .ctr[0]       = _CA(0x0001, B6, 0x02, 0x00000010),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_inst_issued2 =
+{
+   .type         = NVC0_HW_SM_QUERY_INST_ISSUED2,
+   .ctr[0]       = _CA(0x0001, B6, 0x02, 0x00000014),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_local_ld =
+{
+   .type         = NVC0_HW_SM_QUERY_LOCAL_LD,
+   .ctr[0]       = _CA(0x0001, B6, 0x13, 0x00000004),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_local_st =
+{
+   .type         = NVC0_HW_SM_QUERY_LOCAL_ST,
+   .ctr[0]       = _CA(0x0001, B6, 0x13, 0x00000000),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_not_pred_off_inst_executed =
+{
+   .type         = NVC0_HW_SM_QUERY_NOT_PRED_OFF_INST_EXECUTED,
+   .ctr[0]       = _CA(0x003f, B6, 0x05, 0x29062080),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_prof_trigger_0 =
+{
+   .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_0,
+   .ctr[0]       = _CA(0x0001, B6, 0x00, 0x00000000),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_prof_trigger_1 =
+{
+   .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_1,
+   .ctr[0]       = _CA(0x0001, B6, 0x00, 0x00000004),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_prof_trigger_2 =
+{
+   .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_2,
+   .ctr[0]       = _CA(0x0001, B6, 0x00, 0x00000008),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_prof_trigger_3 =
+{
+   .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_3,
+   .ctr[0]       = _CA(0x0001, B6, 0x00, 0x0000000c),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_prof_trigger_4 =
+{
+   .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_4,
+   .ctr[0]       = _CA(0x0001, B6, 0x00, 0x00000010),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_prof_trigger_5 =
+{
+   .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_5,
+   .ctr[0]       = _CA(0x0001, B6, 0x00, 0x00000014),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_prof_trigger_6 =
+{
+   .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_6,
+   .ctr[0]       = _CA(0x0001, B6, 0x00, 0x00000018),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_prof_trigger_7 =
+{
+   .type         = NVC0_HW_SM_QUERY_PROF_TRIGGER_7,
+   .ctr[0]       = _CA(0x0001, B6, 0x00, 0x0000001c),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_shared_atom =
+{
+   .type         = NVC0_HW_SM_QUERY_SHARED_ATOM,
+   .ctr[0]       = _CA(0x0001, B6, 0x13, 0x00000014),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_shared_atom_cas =
+{
+   .type         = NVC0_HW_SM_QUERY_SHARED_ATOM_CAS,
+   .ctr[0]       = _CA(0x0001, B6, 0x13, 0x00000010),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_shared_ld =
+{
+   .type         = NVC0_HW_SM_QUERY_SHARED_LD,
+   .ctr[0]       = _CA(0x0001, B6, 0x13, 0x00000008),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_shared_ld_bank_conflict =
+{
+   .type         = NVC0_HW_SM_QUERY_SHARED_LD_BANK_CONFLICT,
+   .ctr[0]       = _CB(0x0001, B6, 0x0e, 0x00000000),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_shared_ld_transactions =
+{
+   .type         = NVC0_HW_SM_QUERY_SHARED_LD_TRANSACTIONS,
+   .ctr[0]       = _CB(0x0001, B6, 0x0e, 0x00000008),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_shared_st =
+{
+   .type         = NVC0_HW_SM_QUERY_SHARED_ST,
+   .ctr[0]       = _CA(0x0001, B6, 0x13, 0x0000000c),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_shared_st_bank_conflict =
+{
+   .type         = NVC0_HW_SM_QUERY_SHARED_ST_BANK_CONFLICT,
+   .ctr[0]       = _CB(0x0001, B6, 0x0e, 0x00000004),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_shared_st_transactions =
+{
+   .type         = NVC0_HW_SM_QUERY_SHARED_ST_TRANSACTIONS,
+   .ctr[0]       = _CB(0x0001, B6, 0x0e, 0x0000000c),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_sm_cta_launched =
+{
+   .type         = NVC0_HW_SM_QUERY_SM_CTA_LAUNCHED,
+   .ctr[0]       = _CB(0x0001, B6, 0x01, 0x00000018),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_th_inst_executed =
+{
+   .type         = NVC0_HW_SM_QUERY_TH_INST_EXECUTED,
+   .ctr[0]       = _CA(0x003f, B6, 0x04, 0x29062080),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_warps_launched =
+{
+   .type         = NVC0_HW_SM_QUERY_WARPS_LAUNCHED,
+   .ctr[0]       = _CA(0x0001, B6, 0x02, 0x00000008),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg *sm50_hw_sm_queries[] =
+{
+   &sm50_active_ctas,
+   &sm50_active_cycles,
+   &sm50_active_warps,
+   &sm50_atom_count,
+   &sm50_branch,
+   &sm50_divergent_branch,
+   &sm50_global_atom_cas,
+   &sm50_global_ld,
+   &sm50_global_st,
+   &sm50_gred_count,
+   &sm50_inst_executed,
+   &sm50_inst_issued0,
+   &sm50_inst_issued1,
+   &sm50_inst_issued2,
+   &sm50_local_ld,
+   &sm50_local_st,
+   &sm50_not_pred_off_inst_executed,
+   &sm50_prof_trigger_0,
+   &sm50_prof_trigger_1,
+   &sm50_prof_trigger_2,
+   &sm50_prof_trigger_3,
+   &sm50_prof_trigger_4,
+   &sm50_prof_trigger_5,
+   &sm50_prof_trigger_6,
+   &sm50_prof_trigger_7,
+   &sm50_shared_atom,
+   &sm50_shared_atom_cas,
+   &sm50_shared_ld,
+   &sm50_shared_ld_bank_conflict,
+   &sm50_shared_ld_transactions,
+   &sm50_shared_st,
+   &sm50_shared_st_bank_conflict,
+   &sm50_shared_st_transactions,
+   &sm50_sm_cta_launched,
+   &sm50_th_inst_executed,
+   &sm50_warps_launched,
+};
+
+/* ==== Compute capability 5.2 (GM200/GM204/GM206) ==== */
+static const struct nvc0_hw_sm_query_cfg
+sm52_atom_count =
+{
+   .type         = NVC0_HW_SM_QUERY_ATOM_COUNT,
+   .ctr[0]       = _CA(0x0001, B6, 0x0a, 0x0000001c),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm52_global_atom_cas =
+{
+   .type         = NVC0_HW_SM_QUERY_GLOBAL_ATOM_CAS,
+   .ctr[0]       = _CA(0x0001, B6, 0x0a, 0x00000018),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm52_global_ld =
+{
+   .type         = NVC0_HW_SM_QUERY_GLOBAL_LD,
+   .ctr[0]       = _CA(0x0001, B6, 0x0b, 0x00000018),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm52_global_st =
+{
+   .type         = NVC0_HW_SM_QUERY_GLOBAL_ST,
+   .ctr[0]       = _CA(0x0001, B6, 0x0b, 0x0000001c),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm52_gred_count =
+{
+   .type         = NVC0_HW_SM_QUERY_GRED_COUNT,
+   .ctr[0]       = _CA(0x0001, B6, 0x0f, 0x00000018),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm52_inst_executed =
+{
+   .type         = NVC0_HW_SM_QUERY_INST_EXECUTED,
+   .ctr[0]       = _CA(0x0003, B6, 0x03, 0x0000020c),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm52_inst_issued0 =
+{
+   .type         = NVC0_HW_SM_QUERY_INST_ISSUED0,
+   .ctr[0]       = _CA(0x0001, B6, 0x03, 0x00000000),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm52_inst_issued1 =
+{
+   .type         = NVC0_HW_SM_QUERY_INST_ISSUED1,
+   .ctr[0]       = _CA(0x0001, B6, 0x03, 0x00000004),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm52_inst_issued2 =
+{
+   .type         = NVC0_HW_SM_QUERY_INST_ISSUED2,
+   .ctr[0]       = _CA(0x0001, B6, 0x03, 0x00000008),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm52_local_ld =
+{
+   .type         = NVC0_HW_SM_QUERY_LOCAL_LD,
+   .ctr[0]       = _CA(0x0001, B6, 0x06, 0x0000001c),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm52_local_st =
+{
+   .type         = NVC0_HW_SM_QUERY_LOCAL_ST,
+   .ctr[0]       = _CA(0x0001, B6, 0x06, 0x00000018),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm52_shared_atom =
+{
+   .type         = NVC0_HW_SM_QUERY_SHARED_ATOM,
+   .ctr[0]       = _CA(0x0001, B6, 0x08, 0x0000001c),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm52_shared_atom_cas =
+{
+   .type         = NVC0_HW_SM_QUERY_SHARED_ATOM_CAS,
+   .ctr[0]       = _CA(0x0001, B6, 0x08, 0x00000018),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm52_shared_ld =
+{
+   .type         = NVC0_HW_SM_QUERY_SHARED_LD,
+   .ctr[0]       = _CA(0x0001, B6, 0x07, 0x00000018),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm52_shared_st =
+{
+   .type         = NVC0_HW_SM_QUERY_SHARED_ST,
+   .ctr[0]       = _CA(0x0001, B6, 0x07, 0x0000001c),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm52_warps_launched =
+{
+   .type         = NVC0_HW_SM_QUERY_WARPS_LAUNCHED,
+   .ctr[0]       = _CA(0x0001, B6, 0x02, 0x0000001c),
+   .num_counters = 1,
+   .norm         = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg *sm52_hw_sm_queries[] =
+{
+   &sm50_active_ctas,
+   &sm50_active_cycles,
+   &sm50_active_warps,
+   &sm52_atom_count,
+   &sm50_branch,
+   &sm50_divergent_branch,
+   &sm52_global_atom_cas,
+   &sm52_global_ld,
+   &sm52_global_st,
+   &sm52_gred_count,
+   &sm52_inst_executed,
+   &sm52_inst_issued0,
+   &sm52_inst_issued1,
+   &sm52_inst_issued2,
+   &sm52_local_ld,
+   &sm52_local_st,
+   &sm50_not_pred_off_inst_executed,
+   &sm50_prof_trigger_0,
+   &sm50_prof_trigger_1,
+   &sm50_prof_trigger_2,
+   &sm50_prof_trigger_3,
+   &sm50_prof_trigger_4,
+   &sm50_prof_trigger_5,
+   &sm50_prof_trigger_6,
+   &sm50_prof_trigger_7,
+   &sm52_shared_atom,
+   &sm52_shared_atom_cas,
+   &sm52_shared_ld,
+   &sm50_shared_ld_bank_conflict,
+   &sm50_shared_ld_transactions,
+   &sm52_shared_st,
+   &sm50_shared_st_bank_conflict,
+   &sm50_shared_st_transactions,
+   &sm50_sm_cta_launched,
+   &sm50_th_inst_executed,
+   &sm52_warps_launched,
+};
+
 #undef _Q
 #undef _CA
 #undef _CB
@@ -1580,6 +2235,10 @@ nvc0_hw_sm_get_queries(struct nvc0_screen *screen)
    struct nouveau_device *dev = screen->base.device;
 
    switch (screen->base.class_3d) {
+   case GM200_3D_CLASS:
+      return sm52_hw_sm_queries;
+   case GM107_3D_CLASS:
+      return sm50_hw_sm_queries;
    case NVF0_3D_CLASS:
       return sm35_hw_sm_queries;
    case NVE4_3D_CLASS:
@@ -1599,6 +2258,10 @@ nvc0_hw_sm_get_num_queries(struct nvc0_screen *screen)
    struct nouveau_device *dev = screen->base.device;
 
    switch (screen->base.class_3d) {
+   case GM200_3D_CLASS:
+      return ARRAY_SIZE(sm52_hw_sm_queries);
+   case GM107_3D_CLASS:
+      return ARRAY_SIZE(sm50_hw_sm_queries);
    case NVF0_3D_CLASS:
       return ARRAY_SIZE(sm35_hw_sm_queries);
    case NVE4_3D_CLASS:
@@ -1710,6 +2373,18 @@ nve4_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
      BEGIN_NVC0(push, NVE4_CP(MP_PM_SET(c)), 1);
      PUSH_DATA (push, 0);
    }
+
+   if (screen->base.class_3d >= GM107_3D_CLASS) {
+      /* Enable mask for counters: an 8-bit value where bits 0:3 are for
+       * domain A and bits 4:7 are for domain B. For example, the mask for
+       * active_warps should be 0x70 because it uses 3 counters in domain B.
+       * However, always enable all counters because we don't want to track
+       * which ones are enabled, and this allows monitoring multiple queries
+       * at the same time. */
+      BEGIN_NVC0(push, SUBC_CP(0x33e0), 1);
+      PUSH_DATA (push, 0xff);
+   }
+
    return true;
 }
 
@@ -1795,6 +2470,11 @@ nvc0_hw_sm_get_program(struct nvc0_screen *screen)
    prog->translated = true;
    prog->parm_size = 12;
 
+   if (screen->base.class_3d >= GM107_3D_CLASS) {
+      prog->code = (uint32_t *)gm107_read_hw_sm_counters_code;
+      prog->code_size = sizeof(gm107_read_hw_sm_counters_code);
+      prog->num_gprs = 14;
+   } else
    if (screen->base.class_3d == NVE4_3D_CLASS ||
        screen->base.class_3d == NVF0_3D_CLASS) {
       if (screen->base.class_3d == NVE4_3D_CLASS) {
@@ -1885,6 +2565,9 @@ nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
       }
    }
 
+   if (screen->base.class_3d >= GM107_3D_CLASS)
+      IMMED_NVC0(push, SUBC_CP(0x33e0), 0);
+
    BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR,
                 hq->bo);
 
@@ -2121,7 +2804,7 @@ nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
 
    if (id < count) {
       if (screen->compute) {
-         if (screen->base.class_3d <= NVF0_3D_CLASS) {
+         if (screen->base.class_3d <= GM200_3D_CLASS) {
             const struct nvc0_hw_sm_query_cfg **queries =
                nvc0_hw_sm_get_queries(screen);
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h
index 65d6c8b..02f75cf 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h
@@ -21,7 +21,8 @@ nvc0_hw_sm_query(struct nvc0_hw_query *hq)
 #define NVC0_HW_SM_QUERY_LAST   NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_COUNT - 1)
 enum nvc0_hw_sm_queries
 {
-   NVC0_HW_SM_QUERY_ACTIVE_CYCLES = 0,
+   NVC0_HW_SM_QUERY_ACTIVE_CTAS = 0,
+   NVC0_HW_SM_QUERY_ACTIVE_CYCLES,
    NVC0_HW_SM_QUERY_ACTIVE_WARPS,
    NVC0_HW_SM_QUERY_ATOM_CAS_COUNT,
    NVC0_HW_SM_QUERY_ATOM_COUNT,
@@ -29,12 +30,16 @@ enum nvc0_hw_sm_queries
    NVC0_HW_SM_QUERY_DIVERGENT_BRANCH,
    NVC0_HW_SM_QUERY_GLD_REQUEST,
    NVC0_HW_SM_QUERY_GLD_MEM_DIV_REPLAY,
+   NVC0_HW_SM_QUERY_GLOBAL_ATOM_CAS,
+   NVC0_HW_SM_QUERY_GLOBAL_LD,
+   NVC0_HW_SM_QUERY_GLOBAL_ST,
    NVC0_HW_SM_QUERY_GST_TRANSACTIONS,
    NVC0_HW_SM_QUERY_GST_MEM_DIV_REPLAY,
    NVC0_HW_SM_QUERY_GRED_COUNT,
    NVC0_HW_SM_QUERY_GST_REQUEST,
    NVC0_HW_SM_QUERY_INST_EXECUTED,
    NVC0_HW_SM_QUERY_INST_ISSUED,
+   NVC0_HW_SM_QUERY_INST_ISSUED0,
    NVC0_HW_SM_QUERY_INST_ISSUED1,
    NVC0_HW_SM_QUERY_INST_ISSUED2,
    NVC0_HW_SM_QUERY_INST_ISSUED1_0,
@@ -64,10 +69,16 @@ enum nvc0_hw_sm_queries
    NVC0_HW_SM_QUERY_PROF_TRIGGER_5,
    NVC0_HW_SM_QUERY_PROF_TRIGGER_6,
    NVC0_HW_SM_QUERY_PROF_TRIGGER_7,
+   NVC0_HW_SM_QUERY_SHARED_ATOM,
+   NVC0_HW_SM_QUERY_SHARED_ATOM_CAS,
    NVC0_HW_SM_QUERY_SHARED_LD,
+   NVC0_HW_SM_QUERY_SHARED_LD_BANK_CONFLICT,
    NVC0_HW_SM_QUERY_SHARED_LD_REPLAY,
+   NVC0_HW_SM_QUERY_SHARED_LD_TRANSACTIONS,
    NVC0_HW_SM_QUERY_SHARED_ST,
+   NVC0_HW_SM_QUERY_SHARED_ST_BANK_CONFLICT,
    NVC0_HW_SM_QUERY_SHARED_ST_REPLAY,
+   NVC0_HW_SM_QUERY_SHARED_ST_TRANSACTIONS,
    NVC0_HW_SM_QUERY_SM_CTA_LAUNCHED,
    NVC0_HW_SM_QUERY_THREADS_LAUNCHED,
    NVC0_HW_SM_QUERY_TH_INST_EXECUTED,