[Mesa-dev] [PATCH] nvc0: support MP performance counters on Maxwell
Samuel Pitoiset
samuel.pitoiset at gmail.com
Thu Nov 10 19:25:12 UTC 2016
This adds some performance counters/metrics for SM50/SM52.
Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
---
.../drivers/nouveau/nvc0/nvc0_query_hw_metric.c | 26 +-
.../drivers/nouveau/nvc0/nvc0_query_hw_sm.c | 740 ++++++++++++++++++++-
.../drivers/nouveau/nvc0/nvc0_query_hw_sm.h | 13 +-
3 files changed, 775 insertions(+), 4 deletions(-)
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c
index 0e2d89f..089af61 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c
@@ -376,6 +376,22 @@ static const struct nvc0_hw_metric_query_cfg *sm35_hw_metric_queries[] =
&sm35_warp_nonpred_execution_efficiency,
};
+/* ==== Compute capability 5.0 (GM107/GM108) ====
+ *
+ * Metric list used for Maxwell. No SM50-specific metric configuration is
+ * defined here: every entry points at an existing sm20_*, sm30_* or sm35_*
+ * configuration. The same list is returned for both GM107_3D_CLASS and
+ * GM200_3D_CLASS.
+ */
+static const struct nvc0_hw_metric_query_cfg *sm50_hw_metric_queries[] =
+{
+ &sm20_achieved_occupancy,
+ &sm20_branch_efficiency,
+ &sm30_inst_issued,
+ &sm20_inst_per_wrap,
+ &sm30_inst_replay_overhead,
+ &sm20_ipc,
+ &sm30_issued_ipc,
+ &sm30_issue_slots,
+ &sm30_issue_slot_utilization,
+ &sm30_warp_execution_efficiency,
+ &sm35_warp_nonpred_execution_efficiency,
+};
+
#undef _SM
static inline const struct nvc0_hw_metric_query_cfg **
@@ -384,6 +400,9 @@ nvc0_hw_metric_get_queries(struct nvc0_screen *screen)
struct nouveau_device *dev = screen->base.device;
switch (screen->base.class_3d) {
+ case GM200_3D_CLASS:
+ case GM107_3D_CLASS:
+ return sm50_hw_metric_queries;
case NVF0_3D_CLASS:
return sm35_hw_metric_queries;
case NVE4_3D_CLASS:
@@ -403,6 +422,9 @@ nvc0_hw_metric_get_num_queries(struct nvc0_screen *screen)
struct nouveau_device *dev = screen->base.device;
switch (screen->base.class_3d) {
+ case GM200_3D_CLASS:
+ case GM107_3D_CLASS:
+ return ARRAY_SIZE(sm50_hw_metric_queries);
case NVF0_3D_CLASS:
return ARRAY_SIZE(sm35_hw_metric_queries);
case NVE4_3D_CLASS:
@@ -660,6 +682,8 @@ nvc0_hw_metric_get_query_result(struct nvc0_context *nvc0,
}
switch (screen->base.class_3d) {
+ case GM200_3D_CLASS:
+ case GM107_3D_CLASS:
case NVF0_3D_CLASS:
value = sm35_hw_metric_calc_result(hq, res64);
break;
@@ -734,7 +758,7 @@ nvc0_hw_metric_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
if (id < count) {
if (screen->compute) {
- if (screen->base.class_3d <= NVF0_3D_CLASS) {
+ if (screen->base.class_3d <= GM200_3D_CLASS) {
const struct nvc0_hw_metric_query_cfg **queries =
nvc0_hw_metric_get_queries(screen);
const struct nvc0_hw_metric_cfg *cfg =
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
index 2136368..46cbb62 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
@@ -37,6 +37,12 @@ static const struct {
const char *name;
const char *desc;
} nvc0_hw_sm_queries[] = {
+ _Q(ACTIVE_CTAS,
+ "active_ctas",
+ "Accumulated number of active blocks per cycle. For every cycle it "
+ "increments by the number of active blocks in the cycle which can be in "
+ "the range 0 to 32."),
+
_Q(ACTIVE_CYCLES,
"active_cycles",
"Number of cycles a multiprocessor has at least one active warp"),
@@ -81,6 +87,20 @@ static const struct {
"128 bytes. For each extra cache line access the counter is incremented "
"by 1"),
+ _Q(GLOBAL_ATOM_CAS,
+ "global_atom_cas",
+ "Number of ATOM.CAS instructions executed per warp."),
+
+ _Q(GLOBAL_LD,
+ "global_load",
+ "Number of executed load instructions where state space is specified as "
+ "global, increments per warp on a multiprocessor."),
+
+ _Q(GLOBAL_ST,
+ "global_store",
+ "Number of executed store instructions where state space is specified as "
+ "global, increments per warp on a multiprocessor."),
+
_Q(GST_TRANSACTIONS,
"global_store_transaction",
"Number of global store transactions. Increments by 1 per transaction. "
@@ -114,6 +134,11 @@ static const struct {
"inst_issued",
"Number of instructions issued including replays"),
+ _Q(INST_ISSUED0,
+ "inst_issued0",
+ "Number of cycles that did not issue any instruction, increments per "
+ "warp."),
+
_Q(INST_ISSUED1,
"inst_issued1",
"Number of single instruction issued per cycle"),
@@ -260,11 +285,24 @@ static const struct {
"User profiled generic trigger that can be inserted in any place of the "
"code to collect the related information. Increments per warp."),
+ _Q(SHARED_ATOM,
+ "shared_atom",
+ "Number of ATOMS instructions executed per warp."),
+
+ _Q(SHARED_ATOM_CAS,
+ "shared_atom_cas",
+ "Number of ATOMS.CAS instructions executed per warp."),
+
_Q(SHARED_LD,
"shared_load",
"Number of executed load instructions where state space is specified as "
"shared, increments per warp on a multiprocessor"),
+ _Q(SHARED_LD_BANK_CONFLICT,
+ "shared_load_bank_conflict",
+ "Number of shared load bank conflict generated when the addresses for "
+ "two or more shared memory load requests fall in the same memory bank."),
+
_Q(SHARED_LD_REPLAY,
"shared_load_replay",
"Replays caused due to shared load bank conflict (when the addresses for "
@@ -273,11 +311,23 @@ static const struct {
"threads in the warp executing that instruction exceed the number of words "
"that can be loaded in one cycle (256 bytes)"),
+ _Q(SHARED_LD_TRANSACTIONS,
+ "shared_ld_transactions",
+ "Number of transactions for shared load accesses. Maximum transaction "
+ "size in maxwell is 128 bytes, any warp accessing more that 128 bytes "
+ "will cause multiple transactions for a shared load instruction. This "
+ "also includes extra transactions caused by shared bank conflicts."),
+
_Q(SHARED_ST,
"shared_store",
"Number of executed store instructions where state space is specified as "
"shared, increments per warp on a multiprocessor"),
+ _Q(SHARED_ST_BANK_CONFLICT,
+ "shared_store_bank_conflict",
+ "Number of shared store bank conflict generated when the addresses for "
+ "two or more shared memory store requests fall in the same memory bank."),
+
_Q(SHARED_ST_REPLAY,
"shared_store_replay",
"Replays caused due to shared store bank conflict (when the addresses for "
@@ -286,6 +336,13 @@ static const struct {
"threads in the warp executing that instruction exceed the number of words "
"that can be stored in one cycle"),
+ _Q(SHARED_ST_TRANSACTIONS,
+ "shared_st_transactions",
+ "Number of transactions for shared store accesses. Maximum transaction "
+ "size in maxwell is 128 bytes, any warp accessing more that 128 bytes "
+ "will cause multiple transactions for a shared store instruction. This "
+ "also includes extra transactions caused by shared bank conflicts."),
+
_Q(SM_CTA_LAUNCHED,
"sm_cta_launched",
"Number of thread blocks launched on a multiprocessor"),
@@ -480,6 +537,54 @@ static const uint64_t nvf0_read_hw_sm_counters_code[] =
0x18000000001c003cULL,
};
+static const uint64_t gm107_read_hw_sm_counters_code[] =
+{ /* Pre-assembled code, one 64-bit word per entry, installed as the
+   * counter-read program when class_3d >= GM107_3D_CLASS (with
+   * num_gprs = 14; $r13 is the highest register named below).
+   *
+   * Going by the per-word disassembly annotations: each "sched" word is
+   * followed by three instruction words. The code copies $pm0..$pm7 into
+   * $r0..$r7, reads c7[0x620], c7[0x624] and c7[0x628], exits early when
+   * $p0 is not set, and otherwise issues a 128-bit store of $r0.., a
+   * $p1-predicated 128-bit store of $r4.. at +0x40 and a 32-bit store of
+   * $r0 at +0x50 before exiting.
+   *
+   * NOTE(review): the 0x60 multiplier presumably matches the 0x60-byte
+   * per-MP stride used when the results are read back -- confirm.
+   */
+ 0x001f8000fc0007e0ULL, /* sched (st 0x0) (st 0x0) (st 0x0) */
+ 0xf0c8000002170008ULL, /* mov $r8 $tidx */
+ 0xf0c800000037000cULL, /* mov $r12 $virtid */
+ 0xf0c8000000470000ULL, /* mov $r0 $pm0 */
+ 0x001f8000fc0007e0ULL, /* sched (st 0x0) (st 0x0) (st 0x0) */
+ 0xf0c8000000570001ULL, /* mov $r1 $pm1 */
+ 0xf0c8000000670002ULL, /* mov $r2 $pm2 */
+ 0xf0c8000000770003ULL, /* mov $r3 $pm3 */
+ 0x001f8000fc0007e0ULL, /* sched (st 0x0) (st 0x0) (st 0x0) */
+ 0xf0c8000000870004ULL, /* mov $r4 $pm4 */
+ 0xf0c8000000970005ULL, /* mov $r5 $pm5 */
+ 0xf0c8000000a70006ULL, /* mov $r6 $pm6 */
+ 0x001f8000fc0007e0ULL, /* sched (st 0x0) (st 0x0) (st 0x0) */
+ 0xf0c8000000b70007ULL, /* mov $r7 $pm7 */
+ 0x5b6403800087ff07ULL, /* isetp eq u32 and $p0 0x1 0x0 $r8 0x1 */
+ 0x4c98079c1887000aULL, /* mov $r10 c7[0x620] 0xf */
+ 0x001f8000fc0007e0ULL, /* sched (st 0x0) (st 0x0) (st 0x0) */
+ 0x3800000091470c08ULL, /* bfe u32 $r8 $r12 0x914 */
+ 0x4c98079c1897000bULL, /* mov $r11 c7[0x624] 0xf */
+ 0x3800000020870c09ULL, /* bfe u32 $r9 $r12 0x208 */
+ 0x001f8000fc0007e0ULL, /* sched (st 0x0) (st 0x0) (st 0x0) */
+ 0xe30000000008000fULL, /* not $p0 exit */
+ 0x5b6403800097ff0fULL, /* isetp eq u32 and $p1 0x1 0x0 $r9 0x1 */
+ 0x3838000006070808ULL, /* imul u32 u32 $r8 $r8 0x60 */
+ 0x001f8000fc0007e0ULL, /* sched (st 0x0) (st 0x0) (st 0x0) */
+ 0x383800000107090cULL, /* imul u32 u32 $r12 $r9 0x10 */
+ 0x383800000047090dULL, /* imul u32 u32 $r13 $r9 0x4 */
+ 0x5c10000000d70809ULL, /* iadd $r9 $r8 $r13 */
+ 0x001f8000fc0007e0ULL, /* sched (st 0x0) (st 0x0) (st 0x0) */
+ 0x5c10000000c70808ULL, /* iadd $r8 $r8 $r12 */
+ 0x5c98078000a7000cULL, /* mov $r12 $r10 0xf */
+ 0x5c10800000870a0aULL, /* iadd cc $r10 $r10 $r8 */
+ 0x001f8000fc0007e0ULL, /* sched (st 0x0) (st 0x0) (st 0x0) */
+ 0x5c98078000b7000dULL, /* mov $r13 $r11 0xf */
+ 0x5c1008000ff70b0bULL, /* iadd x $r11 $r11 0x0 */
+ 0x5c10800000970c0cULL, /* iadd cc $r12 $r12 $r9 */
+ 0x001f8000fc0007e0ULL, /* sched (st 0x0) (st 0x0) (st 0x0) */
+ 0x5c1008000ff70d0dULL, /* iadd x $r13 $r13 0x0 */
+ 0xbfd0000000070a00ULL, /* st e wt b128 g[$r10] $r0 0x1 */
+ 0x4c98079c18a70000ULL, /* mov $r0 c7[0x628] 0xf */
+ 0x001f8000fc0007e0ULL, /* sched (st 0x0) (st 0x0) (st 0x0) */
+ 0xbfd0000004010c04ULL, /* $p1 st e wt b128 g[$r12+0x40] $r4 0x1 */
+ 0xbf90000005070c00ULL, /* st e wt b32 g[$r12+0x50] $r0 0x1 */
+ 0xe30000000007000fULL, /* exit */
+};
+
/* For simplicity, we will allocate as many group slots as we allocate counter
* slots. This means that a single counter which wants to source from 2 groups
* will have to be declared as using 2 counter slots. This shouldn't really be
@@ -1082,6 +1187,556 @@ static const struct nvc0_hw_sm_query_cfg *sm35_hw_sm_queries[] =
&sm30_warps_launched,
};
+/* ==== Compute capability 5.0 (GM107/GM108) ====
+ *
+ * Every sm50_* configuration in this section programs a single counter
+ * (num_counters = 1) with a 1/1 normalization. The entries differ only in
+ * the query type, in whether _CA or _CB is used, and in the numeric
+ * arguments passed to that macro.
+ */
+static const struct nvc0_hw_sm_query_cfg
+sm50_active_ctas =
+{
+ .type = NVC0_HW_SM_QUERY_ACTIVE_CTAS,
+ .ctr[0] = _CB(0x003f, B6, 0x01, 0x29062080),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_active_cycles =
+{
+ .type = NVC0_HW_SM_QUERY_ACTIVE_CYCLES,
+ .ctr[0] = _CB(0x0001, B6, 0x00, 0x00000004),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_active_warps =
+{
+ .type = NVC0_HW_SM_QUERY_ACTIVE_WARPS,
+ .ctr[0] = _CB(0x003f, B6, 0x00, 0x398a4188),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_atom_count =
+{
+ .type = NVC0_HW_SM_QUERY_ATOM_COUNT,
+ .ctr[0] = _CA(0x0001, B6, 0x14, 0x00000004),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_branch =
+{
+ .type = NVC0_HW_SM_QUERY_BRANCH,
+ .ctr[0] = _CA(0x0001, B6, 0x1a, 0x00000010),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_divergent_branch =
+{
+ .type = NVC0_HW_SM_QUERY_DIVERGENT_BRANCH,
+ .ctr[0] = _CA(0x0001, B6, 0x1a, 0x00000004),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_global_atom_cas =
+{
+ .type = NVC0_HW_SM_QUERY_GLOBAL_ATOM_CAS,
+ .ctr[0] = _CA(0x0001, B6, 0x14, 0x00000000),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_global_ld =
+{
+ .type = NVC0_HW_SM_QUERY_GLOBAL_LD,
+ .ctr[0] = _CA(0x0001, B6, 0x14, 0x0000000c),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_global_st =
+{
+ .type = NVC0_HW_SM_QUERY_GLOBAL_ST,
+ .ctr[0] = _CA(0x0001, B6, 0x14, 0x00000010),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_gred_count =
+{
+ .type = NVC0_HW_SM_QUERY_GRED_COUNT,
+ .ctr[0] = _CA(0x0001, B6, 0x14, 0x00000008),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_inst_executed =
+{
+ .type = NVC0_HW_SM_QUERY_INST_EXECUTED,
+ .ctr[0] = _CA(0x0003, B6, 0x02, 0x00000398),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_inst_issued0 =
+{
+ .type = NVC0_HW_SM_QUERY_INST_ISSUED0,
+ .ctr[0] = _CA(0x0001, B6, 0x02, 0x0000000c),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_inst_issued1 =
+{
+ .type = NVC0_HW_SM_QUERY_INST_ISSUED1,
+ .ctr[0] = _CA(0x0001, B6, 0x02, 0x00000010),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_inst_issued2 =
+{
+ .type = NVC0_HW_SM_QUERY_INST_ISSUED2,
+ .ctr[0] = _CA(0x0001, B6, 0x02, 0x00000014),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_local_ld =
+{
+ .type = NVC0_HW_SM_QUERY_LOCAL_LD,
+ .ctr[0] = _CA(0x0001, B6, 0x13, 0x00000004),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_local_st =
+{
+ .type = NVC0_HW_SM_QUERY_LOCAL_ST,
+ .ctr[0] = _CA(0x0001, B6, 0x13, 0x00000000),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_not_pred_off_inst_executed =
+{
+ .type = NVC0_HW_SM_QUERY_NOT_PRED_OFF_INST_EXECUTED,
+ .ctr[0] = _CA(0x003f, B6, 0x05, 0x29062080),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_prof_trigger_0 =
+{
+ .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_0,
+ .ctr[0] = _CA(0x0001, B6, 0x00, 0x00000000),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_prof_trigger_1 =
+{
+ .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_1,
+ .ctr[0] = _CA(0x0001, B6, 0x00, 0x00000004),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_prof_trigger_2 =
+{
+ .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_2,
+ .ctr[0] = _CA(0x0001, B6, 0x00, 0x00000008),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_prof_trigger_3 =
+{
+ .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_3,
+ .ctr[0] = _CA(0x0001, B6, 0x00, 0x0000000c),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_prof_trigger_4 =
+{
+ .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_4,
+ .ctr[0] = _CA(0x0001, B6, 0x00, 0x00000010),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_prof_trigger_5 =
+{
+ .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_5,
+ .ctr[0] = _CA(0x0001, B6, 0x00, 0x00000014),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_prof_trigger_6 =
+{
+ .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_6,
+ .ctr[0] = _CA(0x0001, B6, 0x00, 0x00000018),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_prof_trigger_7 =
+{
+ .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_7,
+ .ctr[0] = _CA(0x0001, B6, 0x00, 0x0000001c),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_shared_atom =
+{
+ .type = NVC0_HW_SM_QUERY_SHARED_ATOM,
+ .ctr[0] = _CA(0x0001, B6, 0x13, 0x00000014),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_shared_atom_cas =
+{
+ .type = NVC0_HW_SM_QUERY_SHARED_ATOM_CAS,
+ .ctr[0] = _CA(0x0001, B6, 0x13, 0x00000010),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_shared_ld =
+{
+ .type = NVC0_HW_SM_QUERY_SHARED_LD,
+ .ctr[0] = _CA(0x0001, B6, 0x13, 0x00000008),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_shared_ld_bank_conflict =
+{
+ .type = NVC0_HW_SM_QUERY_SHARED_LD_BANK_CONFLICT,
+ .ctr[0] = _CB(0x0001, B6, 0x0e, 0x00000000),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_shared_ld_transactions =
+{
+ .type = NVC0_HW_SM_QUERY_SHARED_LD_TRANSACTIONS,
+ .ctr[0] = _CB(0x0001, B6, 0x0e, 0x00000008),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_shared_st =
+{
+ .type = NVC0_HW_SM_QUERY_SHARED_ST,
+ .ctr[0] = _CA(0x0001, B6, 0x13, 0x0000000c),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_shared_st_bank_conflict =
+{
+ .type = NVC0_HW_SM_QUERY_SHARED_ST_BANK_CONFLICT,
+ .ctr[0] = _CB(0x0001, B6, 0x0e, 0x00000004),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_shared_st_transactions =
+{
+ .type = NVC0_HW_SM_QUERY_SHARED_ST_TRANSACTIONS,
+ .ctr[0] = _CB(0x0001, B6, 0x0e, 0x0000000c),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_sm_cta_launched =
+{
+ .type = NVC0_HW_SM_QUERY_SM_CTA_LAUNCHED,
+ .ctr[0] = _CB(0x0001, B6, 0x01, 0x00000018),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_th_inst_executed =
+{
+ .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED,
+ .ctr[0] = _CA(0x003f, B6, 0x04, 0x29062080),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm50_warps_launched =
+{
+ .type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED,
+ .ctr[0] = _CA(0x0001, B6, 0x02, 0x00000008),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg *sm50_hw_sm_queries[] =
+{ /* The 36 counter configurations exposed when the 3D class is
+   * GM107_3D_CLASS: this array is what nvc0_hw_sm_get_queries() returns
+   * and what nvc0_hw_sm_get_num_queries() sizes for that class. */
+ &sm50_active_ctas,
+ &sm50_active_cycles,
+ &sm50_active_warps,
+ &sm50_atom_count,
+ &sm50_branch,
+ &sm50_divergent_branch,
+ &sm50_global_atom_cas,
+ &sm50_global_ld,
+ &sm50_global_st,
+ &sm50_gred_count,
+ &sm50_inst_executed,
+ &sm50_inst_issued0,
+ &sm50_inst_issued1,
+ &sm50_inst_issued2,
+ &sm50_local_ld,
+ &sm50_local_st,
+ &sm50_not_pred_off_inst_executed,
+ &sm50_prof_trigger_0,
+ &sm50_prof_trigger_1,
+ &sm50_prof_trigger_2,
+ &sm50_prof_trigger_3,
+ &sm50_prof_trigger_4,
+ &sm50_prof_trigger_5,
+ &sm50_prof_trigger_6,
+ &sm50_prof_trigger_7,
+ &sm50_shared_atom,
+ &sm50_shared_atom_cas,
+ &sm50_shared_ld,
+ &sm50_shared_ld_bank_conflict,
+ &sm50_shared_ld_transactions,
+ &sm50_shared_st,
+ &sm50_shared_st_bank_conflict,
+ &sm50_shared_st_transactions,
+ &sm50_sm_cta_launched,
+ &sm50_th_inst_executed,
+ &sm50_warps_launched,
+};
+
+/* ==== Compute capability 5.2 (GM200/GM204/GM206) ====
+ *
+ * Only the counters that need different _CA arguments than their sm50_*
+ * counterparts are redefined in this section; sm52_hw_sm_queries points at
+ * the sm50_* configurations for everything else. As for SM50, each
+ * configuration uses a single counter with a 1/1 normalization.
+ */
+static const struct nvc0_hw_sm_query_cfg
+sm52_atom_count =
+{
+ .type = NVC0_HW_SM_QUERY_ATOM_COUNT,
+ .ctr[0] = _CA(0x0001, B6, 0x0a, 0x0000001c),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm52_global_atom_cas =
+{
+ .type = NVC0_HW_SM_QUERY_GLOBAL_ATOM_CAS,
+ .ctr[0] = _CA(0x0001, B6, 0x0a, 0x00000018),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm52_global_ld =
+{
+ .type = NVC0_HW_SM_QUERY_GLOBAL_LD,
+ .ctr[0] = _CA(0x0001, B6, 0x0b, 0x00000018),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm52_global_st =
+{
+ .type = NVC0_HW_SM_QUERY_GLOBAL_ST,
+ .ctr[0] = _CA(0x0001, B6, 0x0b, 0x0000001c),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm52_gred_count =
+{
+ .type = NVC0_HW_SM_QUERY_GRED_COUNT,
+ .ctr[0] = _CA(0x0001, B6, 0x0f, 0x00000018),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm52_inst_executed =
+{
+ .type = NVC0_HW_SM_QUERY_INST_EXECUTED,
+ .ctr[0] = _CA(0x0003, B6, 0x03, 0x0000020c),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm52_inst_issued0 =
+{
+ .type = NVC0_HW_SM_QUERY_INST_ISSUED0,
+ .ctr[0] = _CA(0x0001, B6, 0x03, 0x00000000),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm52_inst_issued1 =
+{
+ .type = NVC0_HW_SM_QUERY_INST_ISSUED1,
+ .ctr[0] = _CA(0x0001, B6, 0x03, 0x00000004),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm52_inst_issued2 =
+{
+ .type = NVC0_HW_SM_QUERY_INST_ISSUED2,
+ .ctr[0] = _CA(0x0001, B6, 0x03, 0x00000008),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm52_local_ld =
+{
+ .type = NVC0_HW_SM_QUERY_LOCAL_LD,
+ .ctr[0] = _CA(0x0001, B6, 0x06, 0x0000001c),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm52_local_st =
+{
+ .type = NVC0_HW_SM_QUERY_LOCAL_ST,
+ .ctr[0] = _CA(0x0001, B6, 0x06, 0x00000018),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm52_shared_atom =
+{
+ .type = NVC0_HW_SM_QUERY_SHARED_ATOM,
+ .ctr[0] = _CA(0x0001, B6, 0x08, 0x0000001c),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm52_shared_atom_cas =
+{
+ .type = NVC0_HW_SM_QUERY_SHARED_ATOM_CAS,
+ .ctr[0] = _CA(0x0001, B6, 0x08, 0x00000018),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm52_shared_ld =
+{
+ .type = NVC0_HW_SM_QUERY_SHARED_LD,
+ .ctr[0] = _CA(0x0001, B6, 0x07, 0x00000018),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm52_shared_st =
+{
+ .type = NVC0_HW_SM_QUERY_SHARED_ST,
+ .ctr[0] = _CA(0x0001, B6, 0x07, 0x0000001c),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg
+sm52_warps_launched =
+{
+ .type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED,
+ .ctr[0] = _CA(0x0001, B6, 0x02, 0x0000001c),
+ .num_counters = 1,
+ .norm = { 1, 1 },
+};
+
+static const struct nvc0_hw_sm_query_cfg *sm52_hw_sm_queries[] =
+{ /* The 36 counter configurations exposed when the 3D class is
+   * GM200_3D_CLASS. The list has the same slots as sm50_hw_sm_queries:
+   * 16 entries use an sm52_* configuration, the other 20 reuse the
+   * sm50_* one unchanged. */
+ &sm50_active_ctas,
+ &sm50_active_cycles,
+ &sm50_active_warps,
+ &sm52_atom_count,
+ &sm50_branch,
+ &sm50_divergent_branch,
+ &sm52_global_atom_cas,
+ &sm52_global_ld,
+ &sm52_global_st,
+ &sm52_gred_count,
+ &sm52_inst_executed,
+ &sm52_inst_issued0,
+ &sm52_inst_issued1,
+ &sm52_inst_issued2,
+ &sm52_local_ld,
+ &sm52_local_st,
+ &sm50_not_pred_off_inst_executed,
+ &sm50_prof_trigger_0,
+ &sm50_prof_trigger_1,
+ &sm50_prof_trigger_2,
+ &sm50_prof_trigger_3,
+ &sm50_prof_trigger_4,
+ &sm50_prof_trigger_5,
+ &sm50_prof_trigger_6,
+ &sm50_prof_trigger_7,
+ &sm52_shared_atom,
+ &sm52_shared_atom_cas,
+ &sm52_shared_ld,
+ &sm50_shared_ld_bank_conflict,
+ &sm50_shared_ld_transactions,
+ &sm52_shared_st,
+ &sm50_shared_st_bank_conflict,
+ &sm50_shared_st_transactions,
+ &sm50_sm_cta_launched,
+ &sm50_th_inst_executed,
+ &sm52_warps_launched,
+};
+
#undef _Q
#undef _CA
#undef _CB
@@ -1580,6 +2235,10 @@ nvc0_hw_sm_get_queries(struct nvc0_screen *screen)
struct nouveau_device *dev = screen->base.device;
switch (screen->base.class_3d) {
+ case GM200_3D_CLASS:
+ return sm52_hw_sm_queries;
+ case GM107_3D_CLASS:
+ return sm50_hw_sm_queries;
case NVF0_3D_CLASS:
return sm35_hw_sm_queries;
case NVE4_3D_CLASS:
@@ -1599,6 +2258,10 @@ nvc0_hw_sm_get_num_queries(struct nvc0_screen *screen)
struct nouveau_device *dev = screen->base.device;
switch (screen->base.class_3d) {
+ case GM200_3D_CLASS:
+ return ARRAY_SIZE(sm52_hw_sm_queries);
+ case GM107_3D_CLASS:
+ return ARRAY_SIZE(sm50_hw_sm_queries);
case NVF0_3D_CLASS:
return ARRAY_SIZE(sm35_hw_sm_queries);
case NVE4_3D_CLASS:
@@ -1710,6 +2373,18 @@ nve4_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
BEGIN_NVC0(push, NVE4_CP(MP_PM_SET(c)), 1);
PUSH_DATA (push, 0);
}
+
+ if (screen->base.class_3d >= GM107_3D_CLASS) {
+ /* Enable mask for the counters: an 8-bit value where bits 0:3 are for
+ * domain A and bits 4:7 are for domain B. For example, the mask for
+ * active_warps should be 0x70 because it uses 3 counters in domain B.
+ * However, always enable all counters: this avoids having to track which
+ * ones are enabled, and it allows multiple queries to be monitored at the
+ * same time. */
+ BEGIN_NVC0(push, SUBC_CP(0x33e0), 1);
+ PUSH_DATA (push, 0xff);
+ }
+
return true;
}
@@ -1795,6 +2470,11 @@ nvc0_hw_sm_get_program(struct nvc0_screen *screen)
prog->translated = true;
prog->parm_size = 12;
+ if (screen->base.class_3d >= GM107_3D_CLASS) {
+ prog->code = (uint32_t *)gm107_read_hw_sm_counters_code;
+ prog->code_size = sizeof(gm107_read_hw_sm_counters_code);
+ prog->num_gprs = 14;
+ } else
if (screen->base.class_3d == NVE4_3D_CLASS ||
screen->base.class_3d == NVF0_3D_CLASS) {
if (screen->base.class_3d == NVE4_3D_CLASS) {
@@ -1885,6 +2565,9 @@ nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
}
}
+ if (screen->base.class_3d >= GM107_3D_CLASS)
+ IMMED_NVC0(push, SUBC_CP(0x33e0), 0);
+
BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR,
hq->bo);
@@ -1958,6 +2641,8 @@ nvc0_hw_sm_query_read_data(uint32_t count[32][8],
return true;
}
+//#define DEBUG_PM
+
static inline bool
nve4_hw_sm_query_read_data(uint32_t count[32][8],
struct nvc0_context *nvc0, bool wait,
@@ -1967,13 +2652,58 @@ nve4_hw_sm_query_read_data(uint32_t count[32][8],
{
struct nvc0_hw_sm_query *hsq = nvc0_hw_sm_query(hq);
unsigned p, c, d;
+ /* for each MP:
+ * [00] = WS0.C0
+ * [04] = WS0.C1
+ * [08] = WS0.C2
+ * [0c] = WS0.C3
+ * [10] = WS1.C0
+ * [14] = WS1.C1
+ * [18] = WS1.C2
+ * [1c] = WS1.C3
+ * [20] = WS2.C0
+ * [24] = WS2.C1
+ * [28] = WS2.C2
+ * [2c] = WS2.C3
+ * [30] = WS3.C0
+ * [34] = WS3.C1
+ * [38] = WS3.C2
+ * [3c] = WS3.C3
+ * [40] = MP.C4
+ * [44] = MP.C5
+ * [48] = MP.C6
+ * [4c] = MP.C7
+ * [50] = WS0.sequence
+ * [54] = WS1.sequence
+ * [58] = WS2.sequence
+ * [5c] = WS3.sequence
+ */
+#ifdef DEBUG_PM
+ for (p = 0; p < mp_count; p++) {
+ fprintf(stderr, "======== MP[%d] =========\n", p);
+ const unsigned b = (0x60 / 4) * p;
+ for (c = 0; c < 24; c++) {
+ if (hq->data[b + c] == 0)
+ continue;
+ if (p == 4)
+ assert(0);
+ fprintf(stderr, "hq->data[%02x] = %08x\n", c * 4, hq->data[b + c]);
+ }
+ }
+#endif
for (p = 0; p < mp_count; ++p) {
const unsigned b = (0x60 / 4) * p;
-
for (c = 0; c < cfg->num_counters; ++c) {
count[p][c] = 0;
for (d = 0; d < ((hsq->ctr[c] & ~3) ? 1 : 4); ++d) {
+
+ //fprintf(stderr, "p=%d hq->data[%d + 20 + %d]=%d, hq->sequence=%d\n",
+ // p, b, d, hq->data[b + 20 + d], hq->sequence);
+
+ if (p == 4)
+ continue;
+
if (hq->data[b + 20 + d] != hq->sequence) {
if (!wait)
return false;
@@ -1986,7 +2716,11 @@ nve4_hw_sm_query_read_data(uint32_t count[32][8],
count[p][c] += hq->data[b + d * 4 + hsq->ctr[c]];
}
}
+#ifdef DEBUG_PM
+ fprintf(stderr, "count[%d][0]=%d\n", p, count[p][0]);
+#endif
}
+
return true;
}
@@ -2015,6 +2749,8 @@ nvc0_hw_sm_get_query_result(struct nvc0_context *nvc0, struct nvc0_hw_query *hq,
value += count[p][c];
value = (value * cfg->norm[0]) / cfg->norm[1];
+ // fprintf(stderr, "--> %d\n", value);
+
*(uint64_t *)result = value;
return true;
}
@@ -2121,7 +2857,7 @@ nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
if (id < count) {
if (screen->compute) {
- if (screen->base.class_3d <= NVF0_3D_CLASS) {
+ if (screen->base.class_3d <= GM200_3D_CLASS) {
const struct nvc0_hw_sm_query_cfg **queries =
nvc0_hw_sm_get_queries(screen);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h
index 65d6c8b..02f75cf 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h
@@ -21,7 +21,8 @@ nvc0_hw_sm_query(struct nvc0_hw_query *hq)
#define NVC0_HW_SM_QUERY_LAST NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_COUNT - 1)
enum nvc0_hw_sm_queries
{
- NVC0_HW_SM_QUERY_ACTIVE_CYCLES = 0,
+ NVC0_HW_SM_QUERY_ACTIVE_CTAS = 0,
+ NVC0_HW_SM_QUERY_ACTIVE_CYCLES,
NVC0_HW_SM_QUERY_ACTIVE_WARPS,
NVC0_HW_SM_QUERY_ATOM_CAS_COUNT,
NVC0_HW_SM_QUERY_ATOM_COUNT,
@@ -29,12 +30,16 @@ enum nvc0_hw_sm_queries
NVC0_HW_SM_QUERY_DIVERGENT_BRANCH,
NVC0_HW_SM_QUERY_GLD_REQUEST,
NVC0_HW_SM_QUERY_GLD_MEM_DIV_REPLAY,
+ NVC0_HW_SM_QUERY_GLOBAL_ATOM_CAS,
+ NVC0_HW_SM_QUERY_GLOBAL_LD,
+ NVC0_HW_SM_QUERY_GLOBAL_ST,
NVC0_HW_SM_QUERY_GST_TRANSACTIONS,
NVC0_HW_SM_QUERY_GST_MEM_DIV_REPLAY,
NVC0_HW_SM_QUERY_GRED_COUNT,
NVC0_HW_SM_QUERY_GST_REQUEST,
NVC0_HW_SM_QUERY_INST_EXECUTED,
NVC0_HW_SM_QUERY_INST_ISSUED,
+ NVC0_HW_SM_QUERY_INST_ISSUED0,
NVC0_HW_SM_QUERY_INST_ISSUED1,
NVC0_HW_SM_QUERY_INST_ISSUED2,
NVC0_HW_SM_QUERY_INST_ISSUED1_0,
@@ -64,10 +69,16 @@ enum nvc0_hw_sm_queries
NVC0_HW_SM_QUERY_PROF_TRIGGER_5,
NVC0_HW_SM_QUERY_PROF_TRIGGER_6,
NVC0_HW_SM_QUERY_PROF_TRIGGER_7,
+ NVC0_HW_SM_QUERY_SHARED_ATOM,
+ NVC0_HW_SM_QUERY_SHARED_ATOM_CAS,
NVC0_HW_SM_QUERY_SHARED_LD,
+ NVC0_HW_SM_QUERY_SHARED_LD_BANK_CONFLICT,
NVC0_HW_SM_QUERY_SHARED_LD_REPLAY,
+ NVC0_HW_SM_QUERY_SHARED_LD_TRANSACTIONS,
NVC0_HW_SM_QUERY_SHARED_ST,
+ NVC0_HW_SM_QUERY_SHARED_ST_BANK_CONFLICT,
NVC0_HW_SM_QUERY_SHARED_ST_REPLAY,
+ NVC0_HW_SM_QUERY_SHARED_ST_TRANSACTIONS,
NVC0_HW_SM_QUERY_SM_CTA_LAUNCHED,
NVC0_HW_SM_QUERY_THREADS_LAUNCHED,
NVC0_HW_SM_QUERY_TH_INST_EXECUTED,
--
2.10.2
More information about the mesa-dev
mailing list