[Mesa-dev] [PATCH 7/8] radeonsi: implement AMD_performance_monitor for CIK+

Nicolai Hähnle nhaehnle at gmail.com
Tue Nov 24 08:00:14 PST 2015


Expose most of the performance counter groups that are exposed by Catalyst.
Ideally, the driver will work with GPUPerfStudio at some point, but we are not
quite there yet. In any case, this is why multiple instances of hardware
blocks are grouped the way they are.
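
For illustration only (not part of the patch), here is a minimal sketch of how
an application could enumerate the exposed groups through the
GL_AMD_performance_monitor API; it assumes GL_GLEXT_PROTOTYPES (in practice
the entry points are usually loaded via glXGetProcAddress/eglGetProcAddress):

  #define GL_GLEXT_PROTOTYPES
  #include <GL/gl.h>
  #include <GL/glext.h>
  #include <stdio.h>
  #include <stdlib.h>

  /* Print every counter group exposed by the driver together with its
   * number of counters and how many of them can be active at once. */
  static void list_perf_counter_groups(void)
  {
          GLint num_groups = 0;
          glGetPerfMonitorGroupsAMD(&num_groups, 0, NULL);

          GLuint *groups = malloc(num_groups * sizeof(*groups));
          glGetPerfMonitorGroupsAMD(&num_groups, num_groups, groups);

          for (GLint i = 0; i < num_groups; ++i) {
                  char name[64];
                  GLint num_counters = 0, max_active = 0;

                  glGetPerfMonitorGroupStringAMD(groups[i], sizeof(name),
                                                 NULL, name);
                  glGetPerfMonitorCountersAMD(groups[i], &num_counters,
                                              &max_active, 0, NULL);
                  printf("%s: %d counters, %d active at once\n",
                         name, num_counters, max_active);
          }
          free(groups);
  }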

The counters can also be shown using the Gallium HUD. If one is interested in
seeing how work is distributed across multiple shader engines, one can set the
environment variable RADEON_PC_SEPARATE_SE=1 to obtain finer-grained
performance counter groups; see the example below.
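
For example, assuming counter names of the form built by
r600_perfcounters_add_block (block name, optional SE/instance suffix, then a
three-digit selector index; the names below are illustrative and the available
selectors depend on the chip), the HUD can be used roughly like this:

  # one group per block, summed over all shader engines/instances
  GALLIUM_HUD=GRBM_000 glxgears

  # with per-SE groups such as SPI0, SPI1, ...
  RADEON_PC_SEPARATE_SE=1 GALLIUM_HUD=SPI0_002 glxgears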

Part of the implementation lives in radeon because an implementation for
older hardware would largely follow the same lines, while exposing a
different set of blocks that are programmed slightly differently.
---
 src/gallium/drivers/radeon/Makefile.sources   |   1 +
 src/gallium/drivers/radeon/r600_perfcounter.c | 633 +++++++++++++++++++++++
 src/gallium/drivers/radeon/r600_pipe_common.c |   1 +
 src/gallium/drivers/radeon/r600_pipe_common.h |   7 +
 src/gallium/drivers/radeon/r600_query.c       |  21 +-
 src/gallium/drivers/radeon/r600_query.h       | 121 +++++
 src/gallium/drivers/radeonsi/Makefile.sources |   1 +
 src/gallium/drivers/radeonsi/si_perfcounter.c | 697 ++++++++++++++++++++++++++
 src/gallium/drivers/radeonsi/si_pipe.c        |   3 +
 src/gallium/drivers/radeonsi/si_pipe.h        |   3 +
 10 files changed, 1485 insertions(+), 3 deletions(-)
 create mode 100644 src/gallium/drivers/radeon/r600_perfcounter.c
 create mode 100644 src/gallium/drivers/radeonsi/si_perfcounter.c

diff --git a/src/gallium/drivers/radeon/Makefile.sources b/src/gallium/drivers/radeon/Makefile.sources
index 1dbad2f..eb171f7 100644
--- a/src/gallium/drivers/radeon/Makefile.sources
+++ b/src/gallium/drivers/radeon/Makefile.sources
@@ -4,6 +4,7 @@ C_SOURCES := \
 	r600_cs.h \
 	r600d_common.h \
 	r600_gpu_load.c \
+	r600_perfcounter.c \
 	r600_pipe_common.c \
 	r600_pipe_common.h \
 	r600_query.c \
diff --git a/src/gallium/drivers/radeon/r600_perfcounter.c b/src/gallium/drivers/radeon/r600_perfcounter.c
new file mode 100644
index 0000000..b4915d0
--- /dev/null
+++ b/src/gallium/drivers/radeon/r600_perfcounter.c
@@ -0,0 +1,633 @@
+/*
+ * Copyright 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *  Nicolai Hähnle <nicolai.haehnle at amd.com>
+ *
+ */
+
+#include "util/u_memory.h"
+#include "r600_query.h"
+#include "r600_pipe_common.h"
+#include "r600d_common.h"
+
+/* Max counters per HW block */
+#define R600_QUERY_MAX_COUNTERS 16
+
+static const char * const r600_pc_shader_suffix[] = {
+	"", "_PS", "_VS", "_GS", "_ES", "_HS", "_LS", "_CS"
+};
+
+static struct r600_perfcounter_block *
+lookup_counter(struct r600_perfcounters *pc, unsigned index,
+	       unsigned *base_gid, unsigned *sub_index)
+{
+	struct r600_perfcounter_block *block = pc->blocks;
+	unsigned bid;
+
+	*base_gid = 0;
+	for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
+		unsigned total = block->num_groups * block->num_selectors;
+
+		if (index < total) {
+			*sub_index = index;
+			return block;
+		}
+
+		index -= total;
+		*base_gid += block->num_groups;
+	}
+
+	return NULL;
+}
+
+static struct r600_perfcounter_block *
+lookup_group(struct r600_perfcounters *pc, unsigned *index)
+{
+	unsigned bid;
+	struct r600_perfcounter_block *block = pc->blocks;
+
+	for (bid = 0; bid < pc->num_blocks; ++bid, ++block) {
+		if (*index < block->num_groups)
+			return block;
+		*index -= block->num_groups;
+	}
+
+	return NULL;
+}
+
+struct r600_pc_group {
+	struct r600_pc_group *next;
+	struct r600_perfcounter_block *block;
+	unsigned sub_gid; /* only used during init */
+	unsigned result_base; /* only used during init */
+	int se;
+	int instance;
+	unsigned num_counters;
+	unsigned selectors[R600_QUERY_MAX_COUNTERS];
+};
+
+struct r600_pc_counter {
+	unsigned base;
+	unsigned dwords;
+	unsigned stride;
+};
+
+struct r600_query_pc {
+	struct r600_query_hw b;
+
+	unsigned shaders;
+	unsigned num_counters;
+	struct r600_pc_counter *counters;
+	struct r600_pc_group *groups;
+};
+
+static void r600_pc_query_destroy(struct r600_common_context *ctx,
+				  struct r600_query *rquery)
+{
+	struct r600_query_pc *query = (struct r600_query_pc *)rquery;
+
+	while (query->groups) {
+		struct r600_pc_group *group = query->groups;
+		query->groups = group->next;
+		FREE(group);
+	}
+
+	FREE(query->counters);
+
+	r600_query_hw_destroy(ctx, rquery);
+}
+
+static void r600_pc_query_emit_start(struct r600_common_context *ctx,
+				     struct r600_query_hw *hwquery,
+				     struct r600_resource *buffer, uint64_t va)
+{
+	struct r600_perfcounters *pc = ctx->screen->perfcounters;
+	struct r600_query_pc *query = (struct r600_query_pc *)hwquery;
+	struct r600_pc_group *group;
+	int current_se = -1;
+	int current_instance = -1;
+
+	if (query->shaders)
+		pc->emit_shaders(ctx, query->shaders);
+
+	for (group = query->groups; group; group = group->next) {
+		struct r600_perfcounter_block *block = group->block;
+
+		if (group->se != current_se || group->instance != current_instance) {
+			current_se = group->se;
+			current_instance = group->instance;
+			pc->emit_instance(ctx, group->se, group->instance);
+		}
+
+		pc->emit_select(ctx, block, group->num_counters, group->selectors);
+	}
+
+	if (current_se != -1 || current_instance != -1)
+		pc->emit_instance(ctx, -1, -1);
+
+	pc->emit_start(ctx, buffer, va);
+}
+
+static void r600_pc_query_emit_stop(struct r600_common_context *ctx,
+				    struct r600_query_hw *hwquery,
+				    struct r600_resource *buffer, uint64_t va)
+{
+	struct r600_perfcounters *pc = ctx->screen->perfcounters;
+	struct r600_query_pc *query = (struct r600_query_pc *)hwquery;
+	struct r600_pc_group *group;
+
+	pc->emit_stop(ctx, buffer, va);
+
+	for (group = query->groups; group; group = group->next) {
+		struct r600_perfcounter_block *block = group->block;
+		unsigned se = group->se >= 0 ? group->se : 0;
+		unsigned se_end = se + 1;
+
+		if ((block->flags & R600_PC_BLOCK_SE) && (group->se < 0))
+			se_end = ctx->screen->info.max_se;
+
+		do {
+			unsigned instance = group->instance >= 0 ? group->instance : 0;
+
+			do {
+				pc->emit_instance(ctx, se, instance);
+				pc->emit_read(ctx, block,
+					      group->num_counters, group->selectors,
+					      buffer, va);
+				va += 4 * group->num_counters;
+			} while (group->instance < 0 && ++instance < block->num_instances);
+		} while (++se < se_end);
+	}
+
+	pc->emit_instance(ctx, -1, -1);
+}
+
+static void r600_pc_query_clear_result(struct r600_query_hw *hwquery,
+				       union pipe_query_result *result)
+{
+	struct r600_query_pc *query = (struct r600_query_pc *)hwquery;
+
+	memset(result, 0, sizeof(result->batch[0]) * query->num_counters);
+}
+
+static void r600_pc_query_add_result(struct r600_common_context *ctx,
+				     struct r600_query_hw *hwquery,
+				     void *buffer,
+				     union pipe_query_result *result)
+{
+	struct r600_query_pc *query = (struct r600_query_pc *)hwquery;
+	uint32_t *results = buffer;
+	unsigned i, j;
+
+	for (i = 0; i < query->num_counters; ++i) {
+		struct r600_pc_counter *counter = &query->counters[i];
+
+		if (counter->base == ~0)
+			continue;
+
+		for (j = 0; j < counter->dwords; ++j) {
+			uint32_t value = results[counter->base + j * counter->stride];
+			result->batch[i].u32 += value;
+		}
+	}
+}
+
+static struct r600_query_ops batch_query_ops = {
+	.destroy = r600_pc_query_destroy,
+	.begin = r600_query_hw_begin,
+	.end = r600_query_hw_end,
+	.get_result = r600_query_hw_get_result
+};
+
+static struct r600_query_hw_ops batch_query_hw_ops = {
+	.emit_start = r600_pc_query_emit_start,
+	.emit_stop = r600_pc_query_emit_stop,
+	.clear_result = r600_pc_query_clear_result,
+	.add_result = r600_pc_query_add_result,
+};
+
+static struct r600_pc_group *get_group_state(struct r600_common_screen *screen,
+					     struct r600_query_pc *query,
+					     struct r600_perfcounter_block *block,
+					     unsigned sub_gid)
+{
+	struct r600_pc_group *group = query->groups;
+
+	while (group) {
+		if (group->block == block && group->sub_gid == sub_gid)
+			return group;
+		group = group->next;
+	}
+
+	group = CALLOC_STRUCT(r600_pc_group);
+	if (!group)
+		return NULL;
+
+	group->block = block;
+	group->sub_gid = sub_gid;
+
+	if (block->flags & R600_PC_BLOCK_SHADER) {
+		unsigned sub_gids = block->num_instances;
+		unsigned shader_id;
+		unsigned shader_mask;
+		unsigned query_shader_mask;
+
+		if (block->flags & R600_PC_BLOCK_SE_GROUPS)
+			sub_gids = sub_gids * screen->info.max_se;
+		shader_id = sub_gid / sub_gids;
+		sub_gid = sub_gid % sub_gids;
+
+		if (shader_id == 0)
+			shader_mask = R600_PC_SHADER_ALL;
+		else
+			shader_mask = 1 << (shader_id - 1);
+
+		query_shader_mask = query->shaders & R600_PC_SHADER_ALL;
+		if (query_shader_mask && query_shader_mask != shader_mask) {
+			fprintf(stderr, "r600_perfcounter: incompatible shader groups\n");
+			FREE(group);
+			return NULL;
+		}
+		query->shaders |= shader_mask;
+	}
+
+	if (block->flags & R600_PC_BLOCK_SHADER_WINDOWED)
+		query->shaders |= R600_PC_SHADER_WINDOWING;
+
+	if (block->flags & R600_PC_BLOCK_SE_GROUPS) {
+		group->se = sub_gid / block->num_instances;
+		sub_gid = sub_gid % block->num_instances;
+	} else {
+		group->se = -1;
+	}
+
+	if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS) {
+		group->instance = sub_gid;
+	} else {
+		group->instance = -1;
+	}
+
+	group->next = query->groups;
+	query->groups = group;
+
+	return group;
+}
+
+struct pipe_query *r600_create_batch_query(struct pipe_context *ctx,
+					   unsigned num_queries,
+					   unsigned *query_types)
+{
+	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
+	struct r600_common_screen *screen = rctx->screen;
+	struct r600_perfcounters *pc = screen->perfcounters;
+	struct r600_perfcounter_block *block;
+	struct r600_pc_group *group;
+	struct r600_query_pc *query;
+	unsigned base_gid, sub_gid, sub_index;
+	unsigned i, j;
+
+	if (!pc)
+		return NULL;
+
+	query = CALLOC_STRUCT(r600_query_pc);
+	if (!query)
+		return NULL;
+
+	query->b.b.ops = &batch_query_ops;
+	query->b.ops = &batch_query_hw_ops;
+	query->b.flags = R600_QUERY_HW_FLAG_TIMER;
+
+	query->num_counters = num_queries;
+
+	/* Collect selectors per group */
+	for (i = 0; i < num_queries; ++i) {
+		unsigned sub_gid;
+
+		if (query_types[i] < R600_QUERY_FIRST_PERFCOUNTER)
+			goto error;
+
+		block = lookup_counter(pc, query_types[i] - R600_QUERY_FIRST_PERFCOUNTER,
+				       &base_gid, &sub_index);
+		if (!block)
+			goto error;
+
+		sub_gid = sub_index / block->num_selectors;
+		sub_index = sub_index % block->num_selectors;
+
+		group = get_group_state(screen, query, block, sub_gid);
+		if (!group)
+			goto error;
+
+		if (group->num_counters >= block->num_counters) {
+			fprintf(stderr,
+				"perfcounter group %s: too many selected\n",
+				block->basename);
+			goto error;
+		}
+		group->selectors[group->num_counters] = sub_index;
+		++group->num_counters;
+	}
+
+	/* Compute result bases and CS size per group */
+	query->b.num_cs_dw_begin = pc->num_start_cs_dwords;
+	query->b.num_cs_dw_end = pc->num_stop_cs_dwords;
+
+	query->b.num_cs_dw_begin += pc->num_instance_cs_dwords; /* conservative */
+	query->b.num_cs_dw_end += pc->num_instance_cs_dwords;
+
+	i = 0;
+	for (group = query->groups; group; group = group->next) {
+		struct r600_perfcounter_block *block = group->block;
+		unsigned select_dw, read_dw;
+		unsigned instances = 1;
+
+		if ((block->flags & R600_PC_BLOCK_SE) && group->se < 0)
+			instances = rctx->screen->info.max_se;
+		if (group->instance < 0)
+			instances *= block->num_instances;
+
+		group->result_base = i;
+		query->b.result_size += 4 * instances * group->num_counters;
+		i += instances * group->num_counters;
+
+		pc->get_size(block, group->num_counters, group->selectors,
+			     &select_dw, &read_dw);
+		query->b.num_cs_dw_begin += select_dw;
+		query->b.num_cs_dw_end += instances * read_dw;
+		query->b.num_cs_dw_begin += pc->num_instance_cs_dwords; /* conservative */
+		query->b.num_cs_dw_end += instances * pc->num_instance_cs_dwords;
+	}
+
+	if (query->shaders) {
+		if ((query->shaders & R600_PC_SHADER_ALL) == 0)
+			query->shaders |= R600_PC_SHADER_ALL;
+		query->b.num_cs_dw_begin += pc->num_shaders_cs_dwords;
+	}
+
+	/* Map user-supplied query array to result indices */
+	query->counters = CALLOC(num_queries, sizeof(*query->counters));
+	for (i = 0; i < num_queries; ++i) {
+		struct r600_pc_counter *counter = &query->counters[i];
+		struct r600_perfcounter_block *block;
+
+		block = lookup_counter(pc, query_types[i] - R600_QUERY_FIRST_PERFCOUNTER,
+				       &base_gid, &sub_index);
+
+		sub_gid = sub_index / block->num_selectors;
+		sub_index = sub_index % block->num_selectors;
+
+		group = get_group_state(screen, query, block, sub_gid);
+		assert(group != NULL);
+
+		for (j = 0; j < group->num_counters; ++j) {
+			if (group->selectors[j] == sub_index)
+				break;
+		}
+
+		counter->base = group->result_base + j;
+		counter->stride = group->num_counters;
+
+		counter->dwords = 1;
+		if ((block->flags & R600_PC_BLOCK_SE) && group->se < 0)
+			counter->dwords = screen->info.max_se;
+		if (group->instance < 0)
+			counter->dwords *= block->num_instances;
+	}
+
+	if (!r600_query_hw_init(rctx, &query->b))
+		goto error;
+
+	return (struct pipe_query *)query;
+
+error:
+	r600_pc_query_destroy(rctx, &query->b.b);
+	return NULL;
+}
+
+int r600_get_perfcounter_info(struct r600_common_screen *screen,
+			      unsigned index,
+			      struct pipe_driver_query_info *info)
+{
+	struct r600_perfcounters *pc = screen->perfcounters;
+	struct r600_perfcounter_block *block;
+	unsigned base_gid, sub;
+
+	if (!pc)
+		return 0;
+
+	if (!info) {
+		unsigned bid, num_queries = 0;
+
+		for (bid = 0; bid < pc->num_blocks; ++bid) {
+			num_queries += pc->blocks[bid].num_selectors *
+				       pc->blocks[bid].num_groups;
+		}
+
+		return num_queries;
+	}
+
+	block = lookup_counter(pc, index, &base_gid, &sub);
+	if (!block)
+		return 0;
+
+	info->name = block->selector_names + sub * block->selector_name_stride;
+	info->query_type = R600_QUERY_FIRST_PERFCOUNTER + index;
+	info->max_value.u64 = 0;
+	info->type = PIPE_DRIVER_QUERY_TYPE_UINT;
+	info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE;
+	info->group_id = base_gid + sub / block->num_selectors;
+	info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
+	return 1;
+}
+
+int r600_get_perfcounter_group_info(struct r600_common_screen *screen,
+				    unsigned index,
+				    struct pipe_driver_query_group_info *info)
+{
+	struct r600_perfcounters *pc = screen->perfcounters;
+	struct r600_perfcounter_block *block;
+
+	if (!pc)
+		return 0;
+
+	if (!info)
+		return pc->num_groups;
+
+	block = lookup_group(pc, &index);
+	if (!block)
+		return 0;
+	info->name = block->group_names + index * block->group_name_stride;
+	info->num_queries = block->num_selectors;
+	info->max_active_queries = block->num_counters;
+	return 1;
+}
+
+void r600_perfcounters_destroy(struct r600_common_screen *rscreen)
+{
+	if (rscreen->perfcounters)
+		rscreen->perfcounters->cleanup(rscreen);
+}
+
+boolean r600_perfcounters_init(struct r600_perfcounters *pc,
+			       unsigned num_blocks)
+{
+	pc->blocks = CALLOC(num_blocks, sizeof(struct r600_perfcounter_block));
+	if (!pc->blocks)
+		return FALSE;
+
+	pc->separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", FALSE);
+	pc->separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", FALSE);
+
+	return TRUE;
+}
+
+boolean r600_perfcounters_add_block(struct r600_common_screen *rscreen,
+				    struct r600_perfcounters *pc,
+				    const char *name, unsigned flags,
+				    unsigned counters, unsigned selectors,
+				    unsigned instances, void *data)
+{
+	struct r600_perfcounter_block *block = &pc->blocks[pc->num_blocks];
+	unsigned i, j, k;
+	unsigned groups_shader = 1, groups_se = 1, groups_instance = 1;
+	unsigned namelen;
+	char *groupname;
+	char *p;
+
+	assert(counters <= R600_QUERY_MAX_COUNTERS);
+
+	block->basename = name;
+	block->flags = flags;
+	block->num_counters = counters;
+	block->num_selectors = selectors;
+	block->num_instances = MAX2(instances, 1);
+	block->data = data;
+
+	if (pc->separate_se && (block->flags & R600_PC_BLOCK_SE))
+		block->flags |= R600_PC_BLOCK_SE_GROUPS;
+	if (pc->separate_instance && block->num_instances > 1)
+		block->flags |= R600_PC_BLOCK_INSTANCE_GROUPS;
+
+	if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS) {
+		groups_instance = block->num_instances;
+		block->num_groups = groups_instance;
+	} else {
+		block->num_groups = 1;
+	}
+
+	if (block->flags & R600_PC_BLOCK_SE_GROUPS) {
+		groups_se = rscreen->info.max_se;
+		block->num_groups *= groups_se;
+	}
+
+	if (block->flags & R600_PC_BLOCK_SHADER) {
+		groups_shader = ARRAY_SIZE(r600_pc_shader_suffix);
+		block->num_groups *= groups_shader;
+	}
+
+	namelen = strlen(name);
+	block->group_name_stride = namelen + 1;
+	if (block->flags & R600_PC_BLOCK_SHADER)
+		block->group_name_stride += 3;
+	if (block->flags & R600_PC_BLOCK_SE_GROUPS) {
+		assert(groups_se <= 10);
+		block->group_name_stride += 1;
+
+		if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS)
+			block->group_name_stride += 1;
+	}
+	if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS) {
+		assert(groups_instance <= 100);
+		block->group_name_stride += 2;
+	}
+
+	block->group_names = MALLOC(block->num_groups * block->group_name_stride);
+	if (!block->group_names)
+		goto error;
+
+	groupname = block->group_names;
+	for (i = 0; i < groups_shader; ++i) {
+		unsigned shaderlen = strlen(r600_pc_shader_suffix[i]);
+		for (j = 0; j < groups_se; ++j) {
+			for (k = 0; k < groups_instance; ++k) {
+				strcpy(groupname, name);
+				p = groupname + namelen;
+
+				if (block->flags & R600_PC_BLOCK_SHADER) {
+					strcpy(p, r600_pc_shader_suffix[i]);
+					p += shaderlen;
+				}
+
+				if (block->flags & R600_PC_BLOCK_SE_GROUPS) {
+					p += sprintf(p, "%d", j);
+					if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS)
+						*p++ = '_';
+				}
+
+				if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS)
+					p += sprintf(p, "%d", k);
+
+				groupname += block->group_name_stride;
+			}
+		}
+	}
+
+	assert(selectors <= 1000);
+	block->selector_name_stride = block->group_name_stride + 4;
+	block->selector_names = MALLOC(block->num_groups * selectors *
+				       block->selector_name_stride);
+	if (!block->selector_names)
+		goto error_groupnames;
+
+	groupname = block->group_names;
+	p = block->selector_names;
+	for (i = 0; i < block->num_groups; ++i) {
+		for (j = 0; j < selectors; ++j) {
+			sprintf(p, "%s_%03d", groupname, j);
+			p += block->selector_name_stride;
+		}
+		groupname += block->group_name_stride;
+	}
+
+	++pc->num_blocks;
+	pc->num_groups += block->num_groups;
+
+	return TRUE;
+
+error_groupnames:
+	FREE(block->group_names);
+error:
+	return FALSE;
+}
+
+void r600_perfcounters_do_destroy(struct r600_perfcounters *pc)
+{
+	unsigned i;
+
+	for (i = 0; i < pc->num_blocks; ++i) {
+		FREE(pc->blocks[i].group_names);
+		FREE(pc->blocks[i].selector_names);
+	}
+	FREE(pc->blocks);
+	FREE(pc);
+}
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 7464f67..f03dcd9 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -977,6 +977,7 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen,
 
 void r600_destroy_common_screen(struct r600_common_screen *rscreen)
 {
+	r600_perfcounters_destroy(rscreen);
 	r600_gpu_load_kill_thread(rscreen);
 
 	pipe_mutex_destroy(rscreen->gpu_load_mutex);
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index fbdc5c4..253d657 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -90,6 +90,7 @@
 #define R600_MAP_BUFFER_ALIGNMENT 64
 
 struct r600_common_context;
+struct r600_perfcounters;
 
 struct radeon_shader_reloc {
 	char *name;
@@ -300,6 +301,9 @@ struct r600_common_screen {
 	volatile unsigned		gpu_load_stop_thread; /* bool */
 
 	char				renderer_string[64];
+
+	/* Performance counters. */
+	struct r600_perfcounters	*perfcounters;
 };
 
 /* This encapsulates a state or an operation which can emitted into the GPU
@@ -508,6 +512,9 @@ void r600_gpu_load_kill_thread(struct r600_common_screen *rscreen);
 uint64_t r600_gpu_load_begin(struct r600_common_screen *rscreen);
 unsigned r600_gpu_load_end(struct r600_common_screen *rscreen, uint64_t begin);
 
+/* r600_perfcounters.c */
+void r600_perfcounters_destroy(struct r600_common_screen *rscreen);
+
 /* r600_query.c */
 void r600_init_screen_query_functions(struct r600_common_screen *rscreen);
 void r600_query_init(struct r600_common_context *rctx);
diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c
index 38bbbbf..09eabab 100644
--- a/src/gallium/drivers/radeon/r600_query.c
+++ b/src/gallium/drivers/radeon/r600_query.c
@@ -1141,11 +1141,15 @@ static int r600_get_driver_query_info(struct pipe_screen *screen,
 	struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
 	unsigned num_queries = r600_get_num_queries(rscreen);
 
-	if (!info)
-		return num_queries;
+	if (!info) {
+		unsigned num_perfcounters =
+			r600_get_perfcounter_info(rscreen, 0, NULL);
+
+		return num_queries + num_perfcounters;
+	}
 
 	if (index >= num_queries)
-		return 0;
+		return r600_get_perfcounter_info(rscreen, index - num_queries, info);
 
 	*info = r600_driver_query_list[index];
 
@@ -1166,9 +1170,19 @@ static int r600_get_driver_query_info(struct pipe_screen *screen,
 	return 1;
 }
 
+static int r600_get_driver_query_group_info(struct pipe_screen *screen,
+					    unsigned index,
+					    struct pipe_driver_query_group_info *info)
+{
+	struct r600_common_screen *rscreen = (struct r600_common_screen *)screen;
+
+	return r600_get_perfcounter_group_info(rscreen, index, info);
+}
+
 void r600_query_init(struct r600_common_context *rctx)
 {
 	rctx->b.create_query = r600_create_query;
+	rctx->b.create_batch_query = r600_create_batch_query;
 	rctx->b.destroy_query = r600_destroy_query;
 	rctx->b.begin_query = r600_begin_query;
 	rctx->b.end_query = r600_end_query;
@@ -1185,4 +1199,5 @@ void r600_query_init(struct r600_common_context *rctx)
 void r600_init_screen_query_functions(struct r600_common_screen *rscreen)
 {
 	rscreen->b.get_driver_query_info = r600_get_driver_query_info;
+	rscreen->b.get_driver_query_group_info = r600_get_driver_query_group_info;
 }
diff --git a/src/gallium/drivers/radeon/r600_query.h b/src/gallium/drivers/radeon/r600_query.h
index 0ea5707..64ac916 100644
--- a/src/gallium/drivers/radeon/r600_query.h
+++ b/src/gallium/drivers/radeon/r600_query.h
@@ -31,7 +31,11 @@
 #include "pipe/p_defines.h"
 #include "util/list.h"
 
+struct pipe_context;
+struct pipe_query;
+
 struct r600_common_context;
+struct r600_common_screen;
 struct r600_query;
 struct r600_query_hw;
 struct r600_resource;
@@ -133,4 +137,121 @@ boolean r600_query_hw_get_result(struct r600_common_context *rctx,
 				 boolean wait,
 				 union pipe_query_result *result);
 
+/* Performance counters */
+enum {
+	/* This block is part of the shader engine */
+	R600_PC_BLOCK_SE = (1 << 0),
+
+	/* Expose per-instance groups instead of summing all instances (within
+	 * an SE). */
+	R600_PC_BLOCK_INSTANCE_GROUPS = (1 << 1),
+
+	/* Expose per-SE groups instead of summing instances across SEs. */
+	R600_PC_BLOCK_SE_GROUPS = (1 << 2),
+
+	/* Shader block */
+	R600_PC_BLOCK_SHADER = (1 << 3),
+
+	/* Non-shader block with perfcounters windowed by shaders. */
+	R600_PC_BLOCK_SHADER_WINDOWED = (1 << 4),
+};
+
+/* Shader enable bits. Chosen to coincide with SQ_PERFCOUNTER_CTRL values */
+enum {
+	R600_PC_SHADER_PS = (1 << 0),
+	R600_PC_SHADER_VS = (1 << 1),
+	R600_PC_SHADER_GS = (1 << 2),
+	R600_PC_SHADER_ES = (1 << 3),
+	R600_PC_SHADER_HS = (1 << 4),
+	R600_PC_SHADER_LS = (1 << 5),
+	R600_PC_SHADER_CS = (1 << 6),
+
+	R600_PC_SHADER_ALL = R600_PC_SHADER_PS | R600_PC_SHADER_VS |
+			     R600_PC_SHADER_GS | R600_PC_SHADER_ES |
+			     R600_PC_SHADER_HS | R600_PC_SHADER_LS |
+			     R600_PC_SHADER_CS,
+
+	R600_PC_SHADER_WINDOWING = (1 << 31),
+};
+
+/* Describes a hardware block with performance counters. Multiple instances of
+ * each block, possibly per-SE, may exist on the chip. Depending on the block
+ * and on the user's configuration, we either
+ *  (a) expose every instance as a performance counter group,
+ *  (b) expose a single performance counter group that reports the sum over all
+ *      instances, or
+ *  (c) expose one performance counter group per instance, but summed over all
+ *      shader engines.
+ */
+struct r600_perfcounter_block {
+	const char *basename;
+	unsigned flags;
+	unsigned num_counters;
+	unsigned num_selectors;
+	unsigned num_instances;
+
+	unsigned num_groups;
+	char *group_names;
+	unsigned group_name_stride;
+
+	char *selector_names;
+	unsigned selector_name_stride;
+
+	void *data;
+};
+
+struct r600_perfcounters {
+	unsigned num_groups;
+	unsigned num_blocks;
+	struct r600_perfcounter_block *blocks;
+
+	unsigned num_start_cs_dwords;
+	unsigned num_stop_cs_dwords;
+	unsigned num_instance_cs_dwords;
+	unsigned num_shaders_cs_dwords;
+
+	void (*get_size)(struct r600_perfcounter_block *,
+			 unsigned count, unsigned *selectors,
+			 unsigned *num_select_dw, unsigned *num_read_dw);
+
+	void (*emit_instance)(struct r600_common_context *,
+			      int se, int instance);
+	void (*emit_shaders)(struct r600_common_context *, unsigned shaders);
+	void (*emit_select)(struct r600_common_context *,
+			    struct r600_perfcounter_block *,
+			    unsigned count, unsigned *selectors);
+	void (*emit_start)(struct r600_common_context *,
+			  struct r600_resource *buffer, uint64_t va);
+	void (*emit_stop)(struct r600_common_context *,
+			  struct r600_resource *buffer, uint64_t va);
+	void (*emit_read)(struct r600_common_context *,
+			  struct r600_perfcounter_block *,
+			  unsigned count, unsigned *selectors,
+			  struct r600_resource *buffer, uint64_t va);
+
+	void (*cleanup)(struct r600_common_screen *);
+
+	boolean separate_se;
+	boolean separate_instance;
+};
+
+struct pipe_query *r600_create_batch_query(struct pipe_context *ctx,
+					   unsigned num_queries,
+					   unsigned *query_types);
+
+int r600_get_perfcounter_info(struct r600_common_screen *,
+			      unsigned index,
+			      struct pipe_driver_query_info *info);
+int r600_get_perfcounter_group_info(struct r600_common_screen *,
+				    unsigned index,
+				    struct pipe_driver_query_group_info *info);
+
+boolean r600_perfcounters_init(struct r600_perfcounters *, unsigned num_blocks);
+boolean r600_perfcounters_add_block(struct r600_common_screen *,
+				    struct r600_perfcounters *,
+				    const char *name, unsigned flags,
+				    unsigned counters, unsigned selectors,
+				    unsigned instances, void *data);
+void r600_perfcounters_do_destroy(struct r600_perfcounters *);
+
 #endif /* R600_QUERY_H */
diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources
index 7e997c6..53404ab 100644
--- a/src/gallium/drivers/radeonsi/Makefile.sources
+++ b/src/gallium/drivers/radeonsi/Makefile.sources
@@ -12,6 +12,7 @@ C_SOURCES := \
 	si_pipe.h \
 	si_pm4.c \
 	si_pm4.h \
+	si_perfcounter.c \
 	si_public.h \
 	si_shader.c \
 	si_shader.h \
diff --git a/src/gallium/drivers/radeonsi/si_perfcounter.c b/src/gallium/drivers/radeonsi/si_perfcounter.c
new file mode 100644
index 0000000..974a31b
--- /dev/null
+++ b/src/gallium/drivers/radeonsi/si_perfcounter.c
@@ -0,0 +1,697 @@
+/*
+ * Copyright 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *  Nicolai Hähnle <nicolai.haehnle at amd.com>
+ *
+ */
+
+#include "radeon/r600_cs.h"
+#include "radeon/r600_query.h"
+#include "radeon/r600_pipe_common.h"
+#include "util/u_memory.h"
+
+#include "si_pipe.h"
+#include "sid.h"
+
+enum si_pc_reg_layout {
+	/* All secondary selector dwords follow as one block after the primary
+	 * selector dwords for the counters that have secondary selectors.
+	 */
+	SI_PC_MULTI_BLOCK = 0,
+
+	/* Each secondary selector dword follows immediately after the
+	 * corresponding primary.
+	 */
+	SI_PC_MULTI_ALTERNATE = 1,
+
+	/* All secondary selector dwords follow as one block after all primary
+	 * selector dwords.
+	 */
+	SI_PC_MULTI_TAIL = 2,
+
+	/* Free-form arrangement of selector registers. */
+	SI_PC_MULTI_CUSTOM = 3,
+
+	SI_PC_MULTI_MASK = 3,
+
+	/* Registers are laid out in decreasing rather than increasing order. */
+	SI_PC_REG_REVERSE = 4,
+};
+
+struct si_pc_block_base {
+	const char *name;
+	unsigned num_counters;
+	unsigned flags;
+
+	unsigned select_or;
+	unsigned select0;
+	unsigned counter0_lo;
+	unsigned *select;
+	unsigned *counters;
+	unsigned num_multi;
+	unsigned num_prelude;
+	unsigned layout;
+};
+
+struct si_pc_block {
+	struct si_pc_block_base *b;
+	unsigned selectors;
+	unsigned instances;
+};
+
+
+static struct si_pc_block_base cik_CB = {
+	.name = "CB",
+	.num_counters = 4,
+	.flags = R600_PC_BLOCK_SE | R600_PC_BLOCK_INSTANCE_GROUPS,
+
+	.select0 = R_037000_CB_PERFCOUNTER_FILTER,
+	.counter0_lo = R_035018_CB_PERFCOUNTER0_LO,
+	.num_multi = 1,
+	.num_prelude = 1,
+	.layout = SI_PC_MULTI_ALTERNATE,
+};
+
+static unsigned cik_CPC_select[] = {
+	R_036024_CPC_PERFCOUNTER0_SELECT,
+	R_036010_CPC_PERFCOUNTER0_SELECT1,
+	R_03600C_CPC_PERFCOUNTER1_SELECT,
+};
+static struct si_pc_block_base cik_CPC = {
+	.name = "CPC",
+	.num_counters = 2,
+
+	.select = cik_CPC_select,
+	.counter0_lo = R_034018_CPC_PERFCOUNTER0_LO,
+	.num_multi = 1,
+	.layout = SI_PC_MULTI_CUSTOM | SI_PC_REG_REVERSE,
+};
+
+static struct si_pc_block_base cik_CPF = {
+	.name = "CPF",
+	.num_counters = 2,
+
+	.select0 = R_03601C_CPF_PERFCOUNTER0_SELECT,
+	.counter0_lo = R_034028_CPF_PERFCOUNTER0_LO,
+	.num_multi = 1,
+	.layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE,
+};
+
+static struct si_pc_block_base cik_CPG = {
+	.name = "CPG",
+	.num_counters = 2,
+
+	.select0 = R_036008_CPG_PERFCOUNTER0_SELECT,
+	.counter0_lo = R_034008_CPG_PERFCOUNTER0_LO,
+	.num_multi = 1,
+	.layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE,
+};
+
+static struct si_pc_block_base cik_DB = {
+	.name = "DB",
+	.num_counters = 4,
+	.flags = R600_PC_BLOCK_SE | R600_PC_BLOCK_INSTANCE_GROUPS,
+
+	.select0 = R_037100_DB_PERFCOUNTER0_SELECT,
+	.counter0_lo = R_035100_DB_PERFCOUNTER0_LO,
+	.num_multi = 3, // really only 2, but there's a gap between registers
+	.layout = SI_PC_MULTI_ALTERNATE,
+};
+
+static struct si_pc_block_base cik_GDS = {
+	.name = "GDS",
+	.num_counters = 4,
+
+	.select0 = R_036A00_GDS_PERFCOUNTER0_SELECT,
+	.counter0_lo = R_034A00_GDS_PERFCOUNTER0_LO,
+	.num_multi = 1,
+	.layout = SI_PC_MULTI_TAIL,
+};
+
+static unsigned cik_GRBM_counters[] = {
+	R_034100_GRBM_PERFCOUNTER0_LO,
+	R_03410C_GRBM_PERFCOUNTER1_LO,
+};
+static struct si_pc_block_base cik_GRBM = {
+	.name = "GRBM",
+	.num_counters = 2,
+
+	.select0 = R_036100_GRBM_PERFCOUNTER0_SELECT,
+	.counters = cik_GRBM_counters,
+};
+
+static struct si_pc_block_base cik_GRBMSE = {
+	.name = "GRBMSE",
+	.num_counters = 4,
+
+	.select0 = R_036108_GRBM_SE0_PERFCOUNTER_SELECT,
+	.counter0_lo = R_034114_GRBM_SE0_PERFCOUNTER_LO,
+};
+
+static struct si_pc_block_base cik_IA = {
+	.name = "IA",
+	.num_counters = 4,
+
+	.select0 = R_036210_IA_PERFCOUNTER0_SELECT,
+	.counter0_lo = R_034220_IA_PERFCOUNTER0_LO,
+	.num_multi = 1,
+	.layout = SI_PC_MULTI_TAIL,
+};
+
+static struct si_pc_block_base cik_PA_SC = {
+	.name = "PA_SC",
+	.num_counters = 8,
+	.flags = R600_PC_BLOCK_SE,
+
+	.select0 = R_036500_PA_SC_PERFCOUNTER0_SELECT,
+	.counter0_lo = R_034500_PA_SC_PERFCOUNTER0_LO,
+	.num_multi = 1,
+	.layout = SI_PC_MULTI_ALTERNATE,
+};
+
+static struct si_pc_block_base cik_PA_SU = {
+	.name = "PA_SU",
+	.num_counters = 4,
+	.flags = R600_PC_BLOCK_SE,
+
+	.select0 = R_036400_PA_SU_PERFCOUNTER0_SELECT,
+	.counter0_lo = R_034400_PA_SU_PERFCOUNTER0_LO,
+	.num_multi = 2,
+	.layout = SI_PC_MULTI_ALTERNATE,
+};
+
+static struct si_pc_block_base cik_SPI = {
+	.name = "SPI",
+	.num_counters = 6,
+	.flags = R600_PC_BLOCK_SE,
+
+	.select0 = R_036600_SPI_PERFCOUNTER0_SELECT,
+	.counter0_lo = R_034604_SPI_PERFCOUNTER0_LO,
+	.num_multi = 4,
+	.layout = SI_PC_MULTI_BLOCK,
+};
+
+static struct si_pc_block_base cik_SQ = {
+	.name = "SQ",
+	.num_counters = 16,
+	.flags = R600_PC_BLOCK_SE | R600_PC_BLOCK_SHADER,
+
+	.select0 = R_036700_SQ_PERFCOUNTER0_SELECT,
+	.select_or = S_036700_SQC_BANK_MASK(15) |
+			S_036700_SQC_CLIENT_MASK(15) |
+			S_036700_SIMD_MASK(15),
+	.counter0_lo = R_034700_SQ_PERFCOUNTER0_LO,
+};
+
+static struct si_pc_block_base cik_SX = {
+	.name = "SX",
+	.num_counters = 4,
+	.flags = R600_PC_BLOCK_SE,
+
+	.select0 = R_036900_SX_PERFCOUNTER0_SELECT,
+	.counter0_lo = R_034900_SX_PERFCOUNTER0_LO,
+	.num_multi = 2,
+	.layout = SI_PC_MULTI_TAIL,
+};
+
+static struct si_pc_block_base cik_TA = {
+	.name = "TA",
+	.num_counters = 2,
+	.flags = R600_PC_BLOCK_SE | R600_PC_BLOCK_INSTANCE_GROUPS | R600_PC_BLOCK_SHADER_WINDOWED,
+
+	.select0 = R_036B00_TA_PERFCOUNTER0_SELECT,
+	.counter0_lo = R_034B00_TA_PERFCOUNTER0_LO,
+	.num_multi = 1,
+	.layout = SI_PC_MULTI_ALTERNATE,
+};
+
+static struct si_pc_block_base cik_TD = {
+	.name = "TD",
+	.num_counters = 2,
+	.flags = R600_PC_BLOCK_SE | R600_PC_BLOCK_INSTANCE_GROUPS | R600_PC_BLOCK_SHADER_WINDOWED,
+
+	.select0 = R_036C00_TD_PERFCOUNTER0_SELECT,
+	.counter0_lo = R_034C00_TD_PERFCOUNTER0_LO,
+	.num_multi = 1,
+	.layout = SI_PC_MULTI_ALTERNATE,
+};
+
+static struct si_pc_block_base cik_TCA = {
+	.name = "TCA",
+	.num_counters = 4,
+	.flags = R600_PC_BLOCK_INSTANCE_GROUPS,
+
+	.select0 = R_036E40_TCA_PERFCOUNTER0_SELECT,
+	.counter0_lo = R_034E40_TCA_PERFCOUNTER0_LO,
+	.num_multi = 2,
+	.layout = SI_PC_MULTI_ALTERNATE,
+};
+
+static struct si_pc_block_base cik_TCC = {
+	.name = "TCC",
+	.num_counters = 4,
+	.flags = R600_PC_BLOCK_INSTANCE_GROUPS,
+
+	.select0 = R_036E00_TCC_PERFCOUNTER0_SELECT,
+	.counter0_lo = R_034E00_TCC_PERFCOUNTER0_LO,
+	.num_multi = 2,
+	.layout = SI_PC_MULTI_ALTERNATE,
+};
+
+static struct si_pc_block_base cik_TCP = {
+	.name = "TCP",
+	.num_counters = 4,
+	.flags = R600_PC_BLOCK_SE | R600_PC_BLOCK_INSTANCE_GROUPS | R600_PC_BLOCK_SHADER_WINDOWED,
+
+	.select0 = R_036D00_TCP_PERFCOUNTER0_SELECT,
+	.counter0_lo = R_034D00_TCP_PERFCOUNTER0_LO,
+	.num_multi = 2,
+	.layout = SI_PC_MULTI_ALTERNATE,
+};
+
+static struct si_pc_block_base cik_VGT = {
+	.name = "VGT",
+	.num_counters = 4,
+	.flags = R600_PC_BLOCK_SE,
+
+	.select0 = R_036230_VGT_PERFCOUNTER0_SELECT,
+	.counter0_lo = R_034240_VGT_PERFCOUNTER0_LO,
+	.num_multi = 1,
+	.layout = SI_PC_MULTI_TAIL,
+};
+
+static struct si_pc_block_base cik_WD = {
+	.name = "WD",
+	.num_counters = 4,
+
+	.select0 = R_036200_WD_PERFCOUNTER0_SELECT,
+	.counter0_lo = R_034200_WD_PERFCOUNTER0_LO,
+};
+
+/* Both the number of instances and the number of selectors vary between
+ * chips of the same class. We only differentiate by class here and simply
+ * expose the maximum over all chips in a class.
+ */
+static struct si_pc_block groups_CIK[] = {
+	{ &cik_CB, 226, 4 },
+	{ &cik_CPC, 22 },
+	{ &cik_CPF, 17 },
+	{ &cik_CPG, 46 },
+	{ &cik_DB, 257, 4 },
+	{ &cik_GDS, 121 },
+	{ &cik_GRBM, 34 },
+	{ &cik_GRBMSE, 15 },
+	{ &cik_IA, 22 },
+	{ &cik_PA_SC, 395 },
+	{ &cik_PA_SU, 153 },
+	{ &cik_SPI, 186 },
+	{ &cik_SQ, 252 },
+	{ &cik_SX, 32 },
+	{ &cik_TA, 111, 11 },
+	{ &cik_TCA, 39, 2 },
+	{ &cik_TCC, 160, 16 },
+	{ &cik_TCP, 154, 11 },
+	{ &cik_TD, 55, 11 },
+	{ &cik_VGT, 140 },
+	{ &cik_WD, 22 },
+};
+
+static struct si_pc_block groups_VI[] = {
+	{ &cik_CB, 396, 4 },
+	{ &cik_CPC, 24 },
+	{ &cik_CPF, 19 },
+	{ &cik_CPG, 48 },
+	{ &cik_DB, 257, 4 },
+	{ &cik_GDS, 121 },
+	{ &cik_GRBM, 34 },
+	{ &cik_GRBMSE, 15 },
+	{ &cik_IA, 24 },
+	{ &cik_PA_SC, 397 },
+	{ &cik_PA_SU, 153 },
+	{ &cik_SPI, 197 },
+	{ &cik_SQ, 273 },
+	{ &cik_SX, 34 },
+	{ &cik_TA, 119, 16 },
+	{ &cik_TCA, 35, 2 },
+	{ &cik_TCC, 192, 16 },
+	{ &cik_TCP, 180, 16 },
+	{ &cik_TD, 55, 16 },
+	{ &cik_VGT, 147 },
+	{ &cik_WD, 37 },
+};
+
+static void si_pc_get_size(struct r600_perfcounter_block *group,
+			unsigned count, unsigned *selectors,
+			unsigned *num_select_dw, unsigned *num_read_dw)
+{
+	struct si_pc_block *sigroup = (struct si_pc_block *)group->data;
+	struct si_pc_block_base *regs = sigroup->b;
+	unsigned layout_multi = regs->layout & SI_PC_MULTI_MASK;
+
+	if (layout_multi == SI_PC_MULTI_BLOCK) {
+		if (count < regs->num_multi)
+			*num_select_dw = 2 * (count + 2) + regs->num_prelude;
+		else
+			*num_select_dw = 2 + count + regs->num_multi + regs->num_prelude;
+	} else if (layout_multi == SI_PC_MULTI_TAIL) {
+		*num_select_dw = 4 + count + MIN2(count, regs->num_multi) + regs->num_prelude;
+	} else if (layout_multi == SI_PC_MULTI_CUSTOM) {
+		assert(regs->num_prelude == 0);
+		*num_select_dw = 3 * (count + MIN2(count, regs->num_multi));
+	} else {
+		assert(layout_multi == SI_PC_MULTI_ALTERNATE);
+
+		*num_select_dw = 2 + count + MIN2(count, regs->num_multi) + regs->num_prelude;
+	}
+
+	*num_read_dw = 6 * count;
+}
+
+static void si_pc_emit_instance(struct r600_common_context *ctx,
+				int se, int instance)
+{
+	struct radeon_winsys_cs *cs = ctx->gfx.cs;
+	unsigned value = S_030800_SH_BROADCAST_WRITES(1);
+
+	if (se >= 0) {
+		value |= S_030800_SE_INDEX(se);
+	} else {
+		value |= S_030800_SE_BROADCAST_WRITES(1);
+	}
+
+	if (instance >= 0) {
+		value |= S_030800_INSTANCE_INDEX(instance);
+	} else {
+		value |= S_030800_INSTANCE_BROADCAST_WRITES(1);
+	}
+
+	radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value);
+}
+
+static void si_pc_emit_shaders(struct r600_common_context *ctx,
+			       unsigned shaders)
+{
+	struct radeon_winsys_cs *cs = ctx->gfx.cs;
+
+	radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2);
+	radeon_emit(cs, shaders & 0x7f);
+	radeon_emit(cs, 0xffffffff);
+}
+
+static void si_pc_emit_select(struct r600_common_context *ctx,
+		        struct r600_perfcounter_block *group,
+		        unsigned count, unsigned *selectors)
+{
+	struct si_pc_block *sigroup = (struct si_pc_block *)group->data;
+	struct si_pc_block_base *regs = sigroup->b;
+	struct radeon_winsys_cs *cs = ctx->gfx.cs;
+	unsigned idx;
+	unsigned layout_multi = regs->layout & SI_PC_MULTI_MASK;
+	unsigned dw;
+
+	assert(count <= regs->num_counters);
+
+	if (layout_multi == SI_PC_MULTI_BLOCK) {
+		assert(!(regs->layout & SI_PC_REG_REVERSE));
+
+		dw = count + regs->num_prelude;
+		if (count >= regs->num_multi)
+			count += regs->num_multi;
+		radeon_set_uconfig_reg_seq(cs, regs->select0, dw);
+		for (idx = 0; idx < regs->num_prelude; ++idx)
+			radeon_emit(cs, 0);
+		for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx)
+			radeon_emit(cs, selectors[idx] | regs->select_or);
+
+		if (count < regs->num_multi) {
+			unsigned select1 =
+				regs->select0 + 4 * regs->num_multi;
+			radeon_set_uconfig_reg_seq(cs, select1, count);
+		}
+
+		for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx)
+			radeon_emit(cs, 0);
+
+		if (count > regs->num_multi) {
+			for (idx = regs->num_multi; idx < count; ++idx)
+				radeon_emit(cs, selectors[idx] | regs->select_or);
+		}
+	} else if (layout_multi == SI_PC_MULTI_TAIL) {
+		unsigned select1, select1_count;
+
+		assert(!(regs->layout & SI_PC_REG_REVERSE));
+
+		radeon_set_uconfig_reg_seq(cs, regs->select0, count + regs->num_prelude);
+		for (idx = 0; idx < regs->num_prelude; ++idx)
+			radeon_emit(cs, 0);
+		for (idx = 0; idx < count; ++idx)
+			radeon_emit(cs, selectors[idx] | regs->select_or);
+
+		select1 = regs->select0 + 4 * regs->num_counters;
+		select1_count = MIN2(count, regs->num_multi);
+		radeon_set_uconfig_reg_seq(cs, select1, select1_count);
+		for (idx = 0; idx < select1_count; ++idx)
+			radeon_emit(cs, 0);
+	} else if (layout_multi == SI_PC_MULTI_CUSTOM) {
+		unsigned *reg = regs->select;
+		for (idx = 0; idx < count; ++idx) {
+			radeon_set_uconfig_reg(cs, *reg++, selectors[idx] | regs->select_or);
+			if (idx < regs->num_multi)
+				radeon_set_uconfig_reg(cs, *reg++, 0);
+		}
+	} else {
+		assert(layout_multi == SI_PC_MULTI_ALTERNATE);
+
+		unsigned reg_base = regs->select0;
+		unsigned reg_count = count + MIN2(count, regs->num_multi);
+		reg_count += regs->num_prelude;
+
+		if (!(regs->layout & SI_PC_REG_REVERSE)) {
+			radeon_set_uconfig_reg_seq(cs, reg_base, reg_count);
+
+			for (idx = 0; idx < regs->num_prelude; ++idx)
+				radeon_emit(cs, 0);
+			for (idx = 0; idx < count; ++idx) {
+				radeon_emit(cs, selectors[idx] | regs->select_or);
+				if (idx < regs->num_multi)
+					radeon_emit(cs, 0);
+			}
+		} else {
+			reg_base -= (reg_count - 1) * 4;
+			radeon_set_uconfig_reg_seq(cs, reg_base, reg_count);
+
+			for (idx = count; idx > 0; --idx) {
+				if (idx <= regs->num_multi)
+					radeon_emit(cs, 0);
+				radeon_emit(cs, selectors[idx - 1] | regs->select_or);
+			}
+			for (idx = 0; idx < regs->num_prelude; ++idx)
+				radeon_emit(cs, 0);
+		}
+	}
+}
+
+static void si_pc_emit_start(struct r600_common_context *ctx,
+			     struct r600_resource *buffer, uint64_t va)
+{
+	struct radeon_winsys_cs *cs = ctx->gfx.cs;
+
+	radeon_add_to_buffer_list(ctx, &ctx->gfx, buffer,
+				  RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
+
+	radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+	radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) |
+			COPY_DATA_DST_SEL(COPY_DATA_MEM));
+	radeon_emit(cs, 1); /* immediate */
+	radeon_emit(cs, 0); /* unused */
+	radeon_emit(cs, va);
+	radeon_emit(cs, va >> 32);
+
+	radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
+			       S_036020_PERFMON_STATE(V_036020_DISABLE_AND_RESET));
+	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+	radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PERFCOUNTER_START) | EVENT_INDEX(0));
+	radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
+			       S_036020_PERFMON_STATE(V_036020_START_COUNTING));
+}
+
+/* Note: The buffer was already added in si_pc_emit_start, so we don't have to
+ * do it again in here. */
+static void si_pc_emit_stop(struct r600_common_context *ctx,
+			    struct r600_resource *buffer, uint64_t va)
+{
+	struct radeon_winsys_cs *cs = ctx->gfx.cs;
+
+	if (ctx->screen->chip_class == CIK) {
+		/* Workaround for cache flush problems: send two EOP events. */
+		/* XXX - is this needed given that we don't actually care about
+		 * cache flushes here? */
+		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
+		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) |
+				EVENT_INDEX(5));
+		radeon_emit(cs, va);
+		radeon_emit(cs, (va >> 32) | EOP_DATA_SEL(1));
+		radeon_emit(cs, 0); /* immediate data */
+		radeon_emit(cs, 0); /* unused */
+	}
+
+	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
+	radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) |
+			EVENT_INDEX(5));
+	radeon_emit(cs, va);
+	radeon_emit(cs, (va >> 32) | EOP_DATA_SEL(1));
+	radeon_emit(cs, 0); /* immediate data */
+	radeon_emit(cs, 0); /* unused */
+
+	radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
+	radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1));
+	radeon_emit(cs, va);
+	radeon_emit(cs, va >> 32);
+	radeon_emit(cs, 0); /* reference value */
+	radeon_emit(cs, 0xffffffff); /* mask */
+	radeon_emit(cs, 4); /* poll interval */
+
+	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+	radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));
+	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+	radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PERFCOUNTER_STOP) | EVENT_INDEX(0));
+	radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
+			       S_036020_PERFMON_STATE(V_036020_STOP_COUNTING) |
+			       S_036020_PERFMON_SAMPLE_ENABLE(1));
+}
+
+static void si_pc_emit_read(struct r600_common_context *ctx,
+			    struct r600_perfcounter_block *group,
+			    unsigned count, unsigned *selectors,
+			    struct r600_resource *buffer, uint64_t va)
+{
+	struct si_pc_block *sigroup = (struct si_pc_block *)group->data;
+	struct si_pc_block_base *regs = sigroup->b;
+	struct radeon_winsys_cs *cs = ctx->gfx.cs;
+	unsigned idx;
+	unsigned reg = regs->counter0_lo;
+	unsigned reg_delta = 8;
+
+	if (regs->layout & SI_PC_REG_REVERSE)
+		reg_delta = -reg_delta;
+
+	for (idx = 0; idx < count; ++idx) {
+		if (regs->counters)
+			reg = regs->counters[idx];
+
+		radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+		radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) |
+				COPY_DATA_DST_SEL(COPY_DATA_MEM));
+		radeon_emit(cs, reg >> 2);
+		radeon_emit(cs, 0); /* unused */
+		radeon_emit(cs, va);
+		radeon_emit(cs, va >> 32);
+		va += 4;
+		reg += reg_delta;
+	}
+}
+
+static void si_pc_cleanup(struct r600_common_screen *rscreen)
+{
+	r600_perfcounters_do_destroy(rscreen->perfcounters);
+	rscreen->perfcounters = NULL;
+}
+
+void si_init_perfcounters(struct si_screen *screen)
+{
+	struct r600_perfcounters *pc;
+	struct si_pc_block *blocks;
+	unsigned num_blocks;
+	unsigned i;
+
+	switch (screen->b.chip_class) {
+	case CIK:
+		blocks = groups_CIK;
+		num_blocks = ARRAY_SIZE(groups_CIK);
+		break;
+	case VI:
+		blocks = groups_VI;
+		num_blocks = ARRAY_SIZE(groups_VI);
+		break;
+	case SI:
+	default:
+		return; /* not implemented */
+	}
+
+	if (screen->b.info.max_sh_per_se != 1) {
+		/* This should not happen on non-SI chips. */
+		fprintf(stderr, "si_init_perfcounters: max_sh_per_se = %d not "
+			"supported (inaccurate performance counters)\n",
+			screen->b.info.max_sh_per_se);
+	}
+
+	pc = CALLOC_STRUCT(r600_perfcounters);
+	if (!pc)
+		return;
+
+	pc->num_start_cs_dwords = 14;
+	pc->num_stop_cs_dwords = 20;
+	pc->num_instance_cs_dwords = 3;
+	pc->num_shaders_cs_dwords = 4;
+
+	if (screen->b.chip_class == CIK) {
+		pc->num_stop_cs_dwords += 6;
+	}
+
+	pc->get_size = si_pc_get_size;
+	pc->emit_instance = si_pc_emit_instance;
+	pc->emit_shaders = si_pc_emit_shaders;
+	pc->emit_select = si_pc_emit_select;
+	pc->emit_start = si_pc_emit_start;
+	pc->emit_stop = si_pc_emit_stop;
+	pc->emit_read = si_pc_emit_read;
+	pc->cleanup = si_pc_cleanup;
+
+	if (!r600_perfcounters_init(pc, num_blocks))
+		goto error;
+
+	for (i = 0; i < num_blocks; ++i) {
+		struct si_pc_block *block = &blocks[i];
+		unsigned instances = block->instances;
+
+		if (!strcmp(block->b->name, "IA")) {
+			if (screen->b.info.max_se > 2)
+				instances = 2;
+		}
+
+		if (!r600_perfcounters_add_block(&screen->b, pc,
+						 block->b->name,
+						 block->b->flags,
+						 block->b->num_counters,
+						 block->selectors,
+						 instances,
+						 block))
+			goto error;
+	}
+
+	screen->b.perfcounters = pc;
+	return;
+
+error:
+	r600_perfcounters_do_destroy(pc);
+}
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 9a0fe80..81d809b 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -630,6 +630,9 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws)
 		return NULL;
 	}
 
+	if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", FALSE))
+		si_init_perfcounters(sscreen);
+
 	sscreen->b.has_cp_dma = true;
 	sscreen->b.has_streamout = true;
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 05d52fe..834c358 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -346,6 +346,9 @@ void si_need_cs_space(struct si_context *ctx);
 /* si_compute.c */
 void si_init_compute_functions(struct si_context *sctx);
 
+/* si_perfcounters.c */
+void si_init_perfcounters(struct si_screen *screen);
+
 /* si_uvd.c */
 struct pipe_video_codec *si_uvd_create_decoder(struct pipe_context *context,
 					       const struct pipe_video_codec *templ);
-- 
2.5.0


